I was recently working on my CUDA wrappers library, and this particular class is one of the oldest pieces of code in the entire project. Since then, I have added tons of other features (for example, a memset that isn't really a memset, and a memcpy that provides a weird API), and it has quickly become bloated and ugly.
A small introduction to my code
This code is a RAII-oriented class that wraps the C-style CUDA API; it uses `cudaStream_t`s to perform operations asynchronously. It also acts as either a host-side or device-side buffer, selected via the template parameter `Side`.
Down below is the code
#pragma once
#include <cuda_runtime.h>

#include <format>
#include <iostream>
#include <memory>
#include <source_location>
#include <stdexcept>
// Evaluates `call` (a CUDA runtime expression returning cudaError_t) exactly
// once and throws raw::cuda_wrappers::cuda_exception on any non-success
// status, embedding the stringified expression, the error name, the error
// description, and the call site. Wrapped in do/while(0) so it behaves as a
// single statement after an unbraced if/else.
// Fixes vs. previous version: `call` is parenthesized (macro hygiene), the
// result variable is uglified so it cannot collide with a caller's `error`,
// the swapped msg/msg_name identifiers are given accurate names, and
// std::source_location::current() is evaluated once instead of twice.
#define CUDA_SAFE_CALL(call) \
do { \
const cudaError_t cuda_safe_call_status_ = (call); \
if (cuda_safe_call_status_ != cudaSuccess) { \
const char *err_name_ = cudaGetErrorName(cuda_safe_call_status_); \
const char *err_desc_ = cudaGetErrorString(cuda_safe_call_status_); \
const std::source_location loc_ = std::source_location::current(); \
throw raw::cuda_wrappers::cuda_exception(std::format( \
"[Error] Function {} failed with error: {} and description: {} in file: {} on line {}", \
#call, err_name_, err_desc_, loc_.file_name(), loc_.line())); \
} \
} while (0)
#include "fwd.h"
#include "stream.h"
namespace raw::cuda_wrappers {
// Exception type thrown by CUDA_SAFE_CALL when a wrapped CUDA runtime call
// reports a non-success status. Carries only the formatted message.
class cuda_exception : public std::runtime_error {
public:
explicit cuda_exception(const std::string &message) : std::runtime_error(message) {}
explicit cuda_exception(const char *message) : std::runtime_error(message) {}
};
// Where a buffer's allocation lives: pinned host memory (cudaMallocHost) or
// device memory (stream-ordered cudaMallocAsync).
enum class side { host, device };
// RAII wrapper around a raw CUDA allocation that lives either in pinned host
// memory (cudaMallocHost) or in device memory (stream-ordered cudaMallocAsync),
// selected by the `Side` template parameter.
//
// All size/amount parameters are BYTE counts, not element counts (this is the
// intended API). Device-side allocation, frees, copies, and zeroing are
// asynchronous and ordered on the attached cuda_stream; host allocation/free
// is synchronous.
template<typename T, side Side = side::device>
class buffer {
// FIX: the cross-side converting constructor below reads the private members
// of buffer<T, other_side>, which is a distinct class; without this friend
// declaration it did not compile.
template<typename, side> friend class buffer;

private:
size_t _size = 0;                         // allocation size in bytes (0 when empty)
std::shared_ptr<cuda_stream> data_stream; // stream all async operations are ordered on
T *ptr = nullptr;                         // owned allocation (nullptr when empty)

// Stream-ordered copy of `size` bytes from src to dst.
void _memcpy(T *dst, const T *src, size_t size, cudaMemcpyKind kind) {
CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, size, kind, data_stream->stream()));
}
// Device allocation, ordered on the attached stream.
void alloc()
requires(Side == side::device)
{
CUDA_SAFE_CALL(cudaMallocAsync(&ptr, _size, data_stream->stream()));
}
// Pinned host allocation (synchronous).
void alloc()
requires(Side == side::host)
{
CUDA_SAFE_CALL(cudaMallocHost(&ptr, _size));
}

public:
// Empty buffer bound to an existing stream; call allocate() before use.
buffer(std::shared_ptr<cuda_stream> stream) : data_stream(std::move(stream)) {}

// Dereference the first element; meaningful for host-side buffers only.
__host__ T &operator*()
requires(Side == side::host)
{
return *ptr;
}

// Factory sharing one lazily-created stream across every buffer created
// through it (one stream per T/Side instantiation).
// FIX: previously returned buffer<T> (always side::device) regardless of
// this instantiation's Side, which did not compile for host buffers.
static buffer create(const size_t size) {
static std::shared_ptr<cuda_stream> _stream = std::make_shared<cuda_stream>();
return buffer(size, _stream);
}

// Allocates `size` bytes on a freshly created stream.
explicit buffer(const size_t size) : _size(size) {
data_stream = std::make_shared<cuda_stream>();
alloc();
}

// Allocates `size` bytes ordered on the supplied stream.
buffer(const size_t size, std::shared_ptr<cuda_stream> stream)
: _size(size), data_stream(std::move(stream)) {
alloc();
}

// Same-side deep copy: allocates and copies rhs's bytes, sharing its stream.
// FIX: the user-declared move operations deleted the implicit copy
// constructor, so same-side copy construction previously did not compile
// even though copy assignment existed.
buffer(const buffer &rhs) : _size(rhs._size), data_stream(rhs.data_stream) {
alloc();
_memcpy(ptr, rhs.ptr, _size, cudaMemcpyDefault);
}

// Cross-side deep copy (host <-> device); relies on the friend declaration
// above to access rhs's private members.
template<side Side_>
explicit buffer(const buffer<T, Side_> &rhs) : _size(rhs._size), data_stream(rhs.data_stream) {
alloc();
_memcpy(ptr, rhs.ptr, _size, cudaMemcpyDefault);
}

// Deep-copy assignment.
// FIX: previously never released the existing allocation before alloc(),
// leaking it on every assignment.
buffer &operator=(const buffer &rhs) {
if (this == &rhs) {
return *this;
}
free();
data_stream = rhs.data_stream;
_size = rhs._size;
alloc();
_memcpy(ptr, rhs.ptr, _size, cudaMemcpyDefault);
return *this;
}

// Move: steals the allocation; rhs is left empty and safe to destroy.
buffer(buffer &&rhs) noexcept
: _size(rhs._size), data_stream(std::move(rhs.data_stream)), ptr(rhs.ptr) {
rhs.ptr = nullptr;
rhs._size = 0;
}

// Move assignment.
// FIX: guards against self-move, which previously freed the allocation and
// then stole the now-dangling pointer from itself.
buffer &operator=(buffer &&rhs) noexcept {
if (this == &rhs) {
return *this;
}
free();
data_stream = std::move(rhs.data_stream);
ptr = rhs.ptr;
rhs.ptr = nullptr;
_size = rhs._size;
rhs._size = 0;
return *this;
}

// free() already resets ptr/_size; the shared stream drops naturally.
~buffer() {
free();
}

// Raw pointer to the (host or device) allocation; nullptr when empty.
T *get() const {
return ptr;
}

// Releases any current allocation, then allocates `size` bytes.
void allocate(size_t size) {
free();
_size = size;
alloc();
}

// Releases the allocation. Errors are logged instead of thrown so this is
// safe from the destructor.
// FIX: guards on ptr (not _size) and resets ptr afterwards, so get() and
// operator bool() no longer report a dangling pointer after free().
void free() {
if (ptr != nullptr) {
try {
if constexpr (Side == side::device) {
CUDA_SAFE_CALL(cudaFreeAsync(ptr, data_stream->stream()));
} else {
CUDA_SAFE_CALL(cudaFreeHost(ptr));
}
} catch (const cuda_exception &e) {
std::cerr << std::format("[CRITICAL] Error Occured In Free Function. \n{}",
e.what())
<< std::endl;
}
ptr = nullptr;
_size = 0;
}
}

// NOTE(review): despite the name, this is an async COPY of `size` bytes from
// `_ptr` into this buffer, not a memset. Kept byte-for-byte for API
// compatibility; consider renaming to copy_from().
void memset(void *_ptr, size_t size, cudaMemcpyKind kind) {
CUDA_SAFE_CALL(cudaMemcpyAsync(ptr, _ptr, size, kind, data_stream->stream()));
}

// True while this buffer owns an allocation.
explicit operator bool() const {
return ptr != nullptr;
}

// Asynchronously zeroes the first `amount` bytes.
void zero_data(size_t amount) const {
CUDA_SAFE_CALL(cudaMemsetAsync(ptr, 0, amount, data_stream->stream()));
}

// Re-binds subsequent async operations to a different stream. Does NOT
// synchronize with work already queued on the old stream.
void set_stream(std::shared_ptr<cuda_stream> stream) {
data_stream = std::move(stream);
}

// Copies `size` bytes from `_ptr` into this buffer starting at `offset`.
// NOTE(review): `offset` is in elements of T (pointer arithmetic on T*),
// while `size` is in bytes — confirm this asymmetry is intended.
void memcpy(void *_ptr, size_t size, size_t offset, cudaMemcpyKind kind) {
CUDA_SAFE_CALL(cudaMemcpyAsync(ptr + offset, _ptr, size, kind, data_stream->stream()));
}

// Current allocation size in bytes.
size_t get_size() const {
return _size;
}
};
} // namespace raw::cuda_wrappers
What I want from this review
- Suggestions on how to improve this class
- An explanation of what I did wrong and what should be separated
- Some suggestions on how to improve the interface
What I want to remain unchanged
- Usage of the `CUDA_SAFE_CALL` macro
- The byte-oriented API (methods take byte counts; I want to keep it since 70% of the time I work with bytes rather than element counts)
- The `cuda_stream` class