I am trying to compile code in which some CPU routines call a function that uses the GPU to speed up certain calculations. The GPU code uses Thrust, specifically thrust::reduce and thrust::device_ptr. When I build the GPU code as a standalone program using nvcc, there are no problems. However, integrating the GPU code with the CPU C++ code causes the following compiler error when compiling the final "wrapper":
nvcc -O2 -c NLC_2D_TFIM.cpp -lcuda -lcudart -lcublas -lcusparse -L../CUDA/Lanczos/sort/sort/gnu/release -lmgpusort
In file included from /usr/local/cuda/bin/../include/thrust/pair.h:265:0,
from /usr/local/cuda/bin/../include/thrust/tuple.h:35,
from /usr/local/cuda/bin/../include/thrust/detail/functional/actor.h:29,
from /usr/local/cuda/bin/../include/thrust/detail/functional/placeholder.h:20,
from /usr/local/cuda/bin/../include/thrust/functional.h:26,
from /usr/local/cuda/bin/../include/thrust/system/detail/error_category.inl:22,
from /usr/local/cuda/bin/../include/thrust/system/error_code.h:516,
from /usr/local/cuda/bin/../include/thrust/system/cuda_error.h:26,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/malloc.inl:26,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/malloc.h:50,
from /usr/local/cuda/bin/../include/thrust/detail/backend/dispatch/malloc.h:22,
from /usr/local/cuda/bin/../include/thrust/detail/device_malloc.inl:23,
from /usr/local/cuda/bin/../include/thrust/device_malloc.h:102,
from /usr/local/cuda/bin/../include/thrust/detail/backend/internal_allocator.h:22,
from /usr/local/cuda/bin/../include/thrust/detail/uninitialized_array.h:23,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy_cross_space.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy_cross_space.h:57,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/dispatch/copy.h:23,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy.h:21,
from /usr/local/cuda/bin/../include/thrust/detail/backend/dispatch/copy.h:24,
from /usr/local/cuda/bin/../include/thrust/detail/backend/copy.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/backend/copy.h:44,
from /usr/local/cuda/bin/../include/thrust/detail/copy.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/copy.h:39,
from /usr/local/cuda/bin/../include/thrust/detail/reference_base.inl:18,
from /usr/local/cuda/bin/../include/thrust/detail/reference_base.h:138,
from /usr/local/cuda/bin/../include/thrust/device_reference.h:27,
from /usr/local/cuda/bin/../include/thrust/detail/device_ptr.inl:23,
from /usr/local/cuda/bin/../include/thrust/device_ptr.h:181,
from ../CUDA/Lanczos/hamiltonian.h:32,
from ../CUDA/Lanczos/lanczos.h:8,
from NLC_2D_TFIM.cpp:17:
/usr/local/cuda/bin/../include/thrust/detail/pair.inl: In function ‘bool thrust::operator<(const thrust::pair<T1, T2>&, const thrust::pair<T1, T2>&)’:
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:22: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:46: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:36: error: parse error in template argument list
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:36: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:58: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:69: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:12: error: parse error in template argument list
make: *** [NLC_2D_TFIM.o] Error 1
NLC_2D_TFIM works with another module (Graphs) which uses std::pair, but none of those pairs is passed to the module which talks to the GPU. Every header uses std as its namespace, not thrust. All the parameters I'm passing to the GPU handler are regular C arrays, ints, etc.
The lines referenced above are:
#include"lanczos.h"
Which uses:
#include"hamiltonian.h"
And then from there:
#include<thrust/device_ptr.h>
In NLC_2D_TFIM.cu, the "wrapper", the code is:
ReadGraphsFromFile(fileGraphs, "rectanglegraphs.dat", TypeFlag); //graphs the information generated by the Graphs module
double J=1.;
for(int hh=1; hh<10; hh++) {
h = hh;
//Create some storage for things to be used in GPU functions
d_hamiltonian* HamilLancz = (d_hamiltonian*)malloc(HowMany*sizeof(d_hamiltonian));
parameters* data = (parameters*)malloc(HowMany*sizeof(parameters));
double** groundstates = (double**)malloc(HowMany*sizeof(double*));
double** eigenvalues = (double**)malloc(HowMany*sizeof(double*));
int* NumElem = (int*)malloc(HowMany*sizeof(int));
int** Bonds = (int**)malloc(HowMany*sizeof(int*));
//Go through each graph we read in earlier
unsigned int i = 1;
while ( i<fileGraphs.size() && fileGraphs.at(i)->Order < 14) { //skip the zeroth graph
//CPU gets the energy for smaller graphs
GENHAM HV(fileGraphs.at(i)->Order, J, h, fileGraphs.at(i)->AdjacencyList, TypeFlag);
LANCZOS lancz(HV.Vdim); //dimension of reduced Hilbert space (Sz sector)
HV.SparseHamJQ(); //generates sparse matrix Hamiltonian for Lanczos
energy = lancz.Diag(HV, 1, prm.valvec_, eVec);
i++;
}
if( argv[0] == "--gpu" || argv[0] == "-g" )
{
while ( i < fileGraphs.size() )
{
i += 30;
for( int j = 0; j < HowMany; j++)
{
Bonds[ j ] = (int*)malloc(sizeof(int)*3*fileGraphs.at(i - j)->Order);
for(unsigned int k = 0; k < fileGraphs.at(i - j)->Order; k++)
{
Bonds[ j ][ k ] = k;
Bonds[ j ][ k + fileGraphs.at(i - j)->Order ] = fileGraphs.at(i - j)->AdjacencyList.at(2*k).second;
Bonds[ j ][ k + 2*fileGraphs.at(i - j)->Order ] = fileGraphs.at(i - j)->AdjacencyList.at(2*k + 1).second;
}
data[ j ].Sz = 0;
data[ j ].dimension = 2;
data[ j ].J1 = J;
data[ j ].J2 = h;
data[ j ].modelType = 2;
eigenvalues[ j ] = (double*)malloc(sizeof(double));
}
//Calls the CPU functions which will talk to the GPU, including Thrust
ConstructSparseMatrix(HowMany, Bonds, HamilLancz, data, NumElem, 1);
lanczos(HowMany, NumElem, HamilLancz, groundstates, eigenvalues, 200, 1, 1e-12);
So there's nothing involving a std::pair that's getting passed to the GPU. Here are the Thrust calls:
for(int i = 0; i < howMany; i++)
{
thrust::device_ptr<int> red_ptr(d_H[i].set);
numElem[i] = thrust::reduce(red_ptr, red_ptr + rawSize[i]);
}
Have you #defined some macros before your Thrust #includes? It's possible that there's a collision which confuses the parser.