When multiple CPU threads dispatch jobs to a single GPU, what is the best way to partition the work among GPU threads so that the CPU threads do not simply repeat each other?
The following code calculates the sum of two large arrays element by element. The correct result is 3.0 for every element. With 1 CPU thread, the code does the right thing. When run with 8 CPU threads, the output becomes 10 because each thread launches the kernel over the full array, so the addition is applied 8 times. I'm looking for a way for each CPU thread to compute 1/8 of the sum without duplicating the others' work (see the sketch after the output below for the kind of thing I have in mind).
#include <iostream>
#include <math.h>
#include <thread>
#include <vector>
#include <cuda.h>
using namespace std;
const unsigned NUM_THREADS = std::thread::hardware_concurrency();
// Kernel function to add the elements of two arrays
__global__
void add_2(int n, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < n) {
        y[i] = x[i] + y[i];
    }
}
//
// Each CPU thread runs this; note that idx_thread is currently unused,
// which is why every thread repeats the same full-array launch
void thread_func(int N, float *x, float *y, int idx_thread)
{
    cudaSetDevice(0);
    int blockSize;
    int minGridSize;
    int gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add_2, 0, N);
    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;
    //gridSize /= NUM_THREADS + 1;
    cout << "blockSize: " << blockSize << " minGridSize: " << minGridSize
         << " gridSize: " << gridSize << endl;
    // Run kernel on all N elements on the GPU (every CPU thread does this)
    add_2<<<gridSize, blockSize>>>(N, x, y);
    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();
}
//
int main()
{
    int N = 1<<20;
    float *x, *y;
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&x, N*sizeof(float));
    cudaMallocManaged(&y, N*sizeof(float));
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    //.. begin multithreading ..
    vector<std::thread> t;
    for(int i = 0; i < NUM_THREADS; i++)
        t.push_back(thread(thread_func, N, x, y, i));
    for(int i = 0; i < NUM_THREADS; i++)
        t[i].join();
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        if(!(i%10000))
            std::cout << i << " " << y[i] << std::endl;
        maxError = fmax(maxError, fabs(y[i]-3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;
    // Free memory
    cudaFree(x);
    cudaFree(y);
    return 0;
}
Output:
blockSize: 1024 minGridSize: 16 gridSize: 1024
..........
blockSize: 1024 minGridSize: 16 gridSize: 1024
0 10
10000 10
20000 10
...
1020000 10
1030000 10
1040000 10
Max error: 7
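What I have in mind is something like the sketch below: each CPU thread launches add_2 over its own contiguous slice of the arrays, so no element is added twice. This is untested, and thread_func_slice, chunk, and the fixed blockSize of 256 are my own made-up names and choices, not code from above; I'm not sure whether per-slice launches like this are the right approach, or whether the kernel itself should be made aware of idx_thread instead.

// Untested sketch: each CPU thread launches the kernel over its own slice.
// thread_func_slice and chunk are hypothetical names for illustration.
void thread_func_slice(int N, float *x, float *y, int idx_thread)
{
    cudaSetDevice(0);
    // Split the array into NUM_THREADS contiguous chunks, rounding up
    int chunk = (N + NUM_THREADS - 1) / NUM_THREADS;
    int offset = idx_thread * chunk;
    // The last chunk may be shorter (or empty) if N % NUM_THREADS != 0
    int count = (offset + chunk > N) ? (N - offset) : chunk;
    if (count <= 0) return;
    int blockSize = 256;
    int gridSize = (count + blockSize - 1) / blockSize;
    // Launch only over this thread's slice, so no element is touched twice
    add_2<<<gridSize, blockSize>>>(count, x + offset, y + offset);
    cudaDeviceSynchronize();
}

If this is right, each of the 8 launches would cover about N/8 elements, every element would be added exactly once, and the result should be 3.0 everywhere. Whether 8 small launches from 8 host threads is actually faster than a single launch over the full array is a separate question.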