What is the most efficient way to allocate memory using cudaMallocHost() and cudaMalloc() so that the memory is aligned to a certain value?
Ideally it would work like _mm_malloc, where you pass the size and the alignment as parameters.
Allocate slightly more than you need with cudaMallocHost(), then advance the returned pointer to the next aligned address (4096 bytes in this example):
auto alignedPtr = ptrAsULL + 4096 - (ptrAsULL%4096);
The original and aligned pointers can then be encapsulated like this:
struct AlignedCudaBuf
{
char * ptrOriginal;
char * ptrAligned;
AlignedCudaBuf()
{
cudaMallocHost( &ptrOriginal, size + extra );
ptrAligned = ((unsigned long long)ptrOriginal) + 4096 - (((unsigned long long)ptrOriginal)%4096);
}
~AlignedCudaBuf()
{
if(ptrOriginal)
{
cudaFreeHost(ptrOriginal);
ptrOriginal=0;
}
}
}
This adds little overhead unless such buffers are created and destroyed frequently (each allocation and free also incurs CUDA API latency).
To guard against accidental copies, you could hold the allocation in a smart pointer instead of a raw pointer, so that the buffer is freed exactly once, when the last remaining copy of AlignedCudaBuf is destroyed:
struct AlignedCudaBuf
{
// will it be shared by multiple graphics cards & threads?
std::shared_ptr<char> ptrOriginal;
char * ptrAligned;
AlignedCudaBuf()
{
char * tmp;
cudaMallocHost( &tmp, size + extra );
ptrAligned = ((unsigned long long)tmp) + 4096 - (((unsigned long long)tmp)%4096);
ptrOriginal = std::shared_ptr<char>(
tmp,
[](char * ptr0){ cudaFreeHost(ptr0); } // custom destructor
);
}
}
See the cudaMallocPitch function.