RAII Wrapper For CUDA Pointers

Question

I was recently working on my CUDA wrappers library, and this particular class is one of the oldest pieces of code in the entire project. Since that time, I added tons of other features (for example memset that isn't really a memset, and memcpy that provides weird API), and it became bloated and ugly quickly.

Small Introduction into my code

This code serves as a RAII oriented class to wrap CUDA C-styled API, it uses cudaStream_t's to perform operations asynchronously. Also it tries to be host/device side buffer via template parameter Side.

Down below is the code

#pragma once

#include <cuda_runtime.h>

#include <memory>

#define CUDA_SAFE_CALL(call)                                                                            \
    do {                                                                                                \
        cudaError_t error = call;                                                                       \
        if (error != cudaSuccess) {                                                                     \
            const char* msg      = cudaGetErrorName(error);                                             \
            const char* msg_name = cudaGetErrorString(error);                                           \
            throw raw::cuda_wrappers::cuda_exception(std::format(                                       \
                "[Error] Function {} failed with error: {} and description: {} in file: {} on line {}", \
                #call, msg, msg_name, std::source_location::current().file_name(),                      \
                std::source_location::current().line()));                                               \
        }                                                                                               \
    } while (0)


#include "fwd.h"
#include "stream.h"
namespace raw::cuda_wrappers {
class cuda_exception : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;
};
enum class side { host, device };
template<typename T, side Side = side::device>
class buffer {
private:
    size_t                       _size = 0;
    std::shared_ptr<cuda_stream> data_stream;
    T                           *ptr = nullptr;

    void _memcpy(T *dst, const T *src, size_t size, cudaMemcpyKind kind) {
        CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, size, kind, data_stream->stream()));
    }
    void alloc()
        requires(Side == side::device)
    {
        CUDA_SAFE_CALL(cudaMallocAsync(&ptr, _size, data_stream->stream()));
    }
    void alloc()
        requires(Side == side::host)
    {
        CUDA_SAFE_CALL(cudaMallocHost(&ptr, _size));
    }

public:
    buffer(std::shared_ptr<cuda_stream> stream) : data_stream(stream) {}

    __host__ T &operator*()
        requires(Side == side::host)
    {
        return *ptr;
    }

    static buffer create(const size_t size) {
        static std::shared_ptr<cuda_stream> _stream = std::make_shared<cuda_stream>();
        return buffer<T>(size, _stream);
    }

    explicit buffer(const size_t size) : _size(size) {
        data_stream = std::make_shared<cuda_stream>();
        alloc();
    }

    buffer(const size_t size, std::shared_ptr<cuda_stream> stream)
        : _size(size), data_stream(std::move(stream)) {
        alloc();
    }

    template<side Side_>
    explicit buffer(const buffer<T, Side_> &rhs) : _size(rhs._size), data_stream(rhs.data_stream) {
        alloc();
        _memcpy(ptr, rhs.ptr, _size, cudaMemcpyDefault);
    }

    buffer &operator=(const buffer &rhs) {
        if (this == &rhs) {
            return *this;
        }
        data_stream = rhs.data_stream;
        _size       = rhs._size;

        alloc();
        _memcpy(ptr, rhs.ptr, _size, cudaMemcpyDefault);

        return *this;
    }

    buffer(buffer &&rhs) noexcept
        : _size(rhs._size), data_stream(std::move(rhs.data_stream)), ptr(rhs.ptr) {
        rhs.ptr   = nullptr;
        rhs._size = 0;
    }

    buffer &operator=(buffer &&rhs) noexcept {
        free();
        data_stream = std::move(rhs.data_stream);

        ptr       = rhs.ptr;
        rhs.ptr   = nullptr;
        _size     = rhs._size;
        rhs._size = 0;

        return *this;
    }

    ~buffer() {
        free();
        ptr         = nullptr;
        data_stream = nullptr;
        _size       = 0;
    }

    T *get() const {
        return ptr;
    }

    void allocate(size_t size) {
        free();
        _size = size;
        alloc();
    }

    void free() {
        if (_size != 0) {
            try {
                if constexpr (Side == side::device) {
                    CUDA_SAFE_CALL(cudaFreeAsync(ptr, data_stream->stream()));
                } else {
                    CUDA_SAFE_CALL(cudaFreeHost(ptr));
                }
            } catch (const cuda_exception &e) {
                std::cerr << std::format("[CRITICAL] Error Occured In Free Function. \n{}",
                                         e.what())
                          << std::endl;
            }
            _size = 0;
        }
    }

    void memset(void *_ptr, size_t size, cudaMemcpyKind kind) {
        CUDA_SAFE_CALL(cudaMemcpyAsync(ptr, _ptr, size, kind, data_stream->stream()));
    }

    explicit operator bool() const {
        return ptr != nullptr;
    }

    void zero_data(size_t amount) const {
        CUDA_SAFE_CALL(cudaMemsetAsync(ptr, 0, amount, data_stream->stream()));
    }
    void set_stream(std::shared_ptr<cuda_stream> stream) {
        data_stream = std::move(stream);
    }

    void memcpy(void *_ptr, size_t size, size_t offset, cudaMemcpyKind kind) {
        CUDA_SAFE_CALL(cudaMemcpyAsync(ptr + offset, _ptr, size, kind, data_stream->stream()));
    }
    size_t get_size() const {
        return _size;
    }
};
} // namespace raw::cuda_wrappers

What I want from this review

Suggestions on how to improve this class
Explanation of what i did wrong and what should be separated
Some suggestions on how to improve the interface

What I want to remain unchanged

Usage of CUDA_SAFE_CALL macro
API that is meant to be used via passing bytes to this class's methods (i want to keep it since 70% of the time i work with bytes and not amount of elements)

I recently posted a Strongly-typed CUDA device memory abstraction for review - you might be interested in some of the ideas there. BTW, it would be helpful to reviewers to see some usage examples (or better, the unit tests). — Toby Speight
– Toby Speight, Commented Sep 26 at 8:04
fwd.h contains forward declaration of other classes and stream.h contains cuda_stream class — NeKon
– NeKon, Commented Sep 26 at 9:48
You may well have changes to make following my review (I certainly would expect you to fix the memory leak), so perhaps a good idea to post a follow-up question and include the tests there. — Toby Speight
– Toby Speight, Commented Sep 26 at 12:40

Toby Speight · Accepted Answer · 2025-09-26 12:33:38Z

We define a helper macro CUDA_SAFE_CALL() but don't #undef it after use. Not clear whether this is intentional, but if our abstraction is complete, then it shouldn't be needed by users.

We should include the headers we need, such as <stdexcept>. Don't rely on transitive inclusion via other headers, as that can change without warning and cause compilation failure at a distance.

I'm guessing that size_t is an alias for std::size_t. It would be better to write it in full, rather that aliasing (which then affects every user of this header-only library).

I'm also guessing that cuda_stream is a similar RAII wrapper around cudaStream_t.

Presumably you have some convention whereby _size and _memcpy have initial underscores, but other members don't. Because the convention isn't explicit, it's just confusing to me.

I would prefer to see all constructors consecutive, rather than intermingled with other functions. As a general rule of thumb, it's helpful to have all overloads of each function together, so readers can easily find them all.

This class is getting to the size where it's useful to group member functions by theme. I would would define lifecycle functions (constructors, destructor, assignment), then informational accessors (generally const) and then mutators, for instance.

If possible, get rid of the constructor that takes only a stream. This results in an object that's unusable and likely to cause errors when its functions are called. (If for some reason you can't remove it, it should certainly be explicit.)

The constructor that takes a size_t implicitly default-initialises stream but then immediately assigns to it. Make that an initialiser instead:

    explicit buffer(const std::size_t size)
      : _size{size},
        data_stream{std::make_shared<cuda_stream>()};
    {
        alloc();
    }

Construction from another buffer uses cudaMemcpyDefault. That requires the target to have Unified virtual Addressing or it will fail.

Copy-assignment operator fails to free ptr, making a memory leak.

Consider writing a (possibly private) swap() that can be used to reduce duplication in move construction and assignment.

Destructor needlessly assigns to members whose lifetime is ending. Just release the resource.

free() is a dangerous function - it allows users to put the object into a state where it points at released memory. I don't see any reason to have this in the public interface (and probably not at all - just inline it into the destructor).

memset() is strangely named, because it copies rather than filling as std::memset() does. And I think it would be better if it accepted T* and a count (or perhaps std::byte* and size) rather than void* which allows mistakes much more readily.

It's also inconvenient for users, who need to determine and use the correct cudaMemcpyKind. We could do that for them, if we took the source memory as a buffer and examined source and destination's Side. (We might do that by overloading the function, for instance.)

Stack Exchange Network

RAII Wrapper For CUDA Pointers

Small Introduction into my code

What I want from this review

What I want to remain unchanged

1 Answer 1

You must log in to answer this question.

Linked

Hot Network Questions

RAII Wrapper For CUDA Pointers

Small Introduction into my code

What I want from this review

What I want to remain unchanged

1 Answer 1

You must log in to answer this question.

Linked

Related

Hot Network Questions