I have been trying to write a program that counts rationals on the Cantor set with certain denominators. I found that with my computer it takes 20 hours or more to compute the number between 3^14 and 3^15. I thought that since this is testing a large number of separate values it would be a good thing to implement on a graphics card with OpenCL. When I tried to implement it though I get performance a few orders of magnitude slower than my CPU implementation. Here is the code of my attempt.
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>
#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2
const int NUM_ELEMENTS = 32768;
void printOutput(unsigned long long start, unsigned long long *values){
for(unsigned int i = 0; i < NUM_ELEMENTS; i++)
if (values[i] != 0)
std::cout << start+i << ',' << values[i] << std::endl;
}
void newList(unsigned long long start, unsigned long long *dataList){
for(int i=0; i < NUM_ELEMENTS; ++i)
dataList[i] = start + i;
}
using namespace cl;
Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;
int init() {
cl_int status = 0;
const char* buildOption ="-x clc++ ";
std::vector<Platform> platforms;
status = Platform::get(&platforms);
if (status != CL_SUCCESS){
std::cout<<"Error: Getting platforms!"<<std::endl;
return FAILURE;
}
std::vector<cl::Platform>::iterator iter;
for(iter = platforms.begin(); iter != platforms.end(); ++iter)
if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
break;
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
bool gpuNotFound = false;
try{
context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
}
catch(std::exception e){
gpuNotFound = true;
}
if(gpuNotFound){
std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
if (status != CL_SUCCESS){
std::cout<<"Error: Creating context!"<<std::endl;
return FAILURE;
}
}
Program program;
try{
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = CommandQueue(context, devices[0]);
std::ifstream sourceFile("Rationals.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
program = Program(context, source);
program.build(devices, buildOption);
kernelA = Kernel(program, "countRationals");
inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
}catch(cl::Error e){
std::cout << e.what() << std::endl;
std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
return FAILURE;
}
return SUCCESS;
}
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try{
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
kernelA.setArg(0, inputBuffer);
kernelA.setArg(1, outputBuffer);
NDRange global(NUM_ELEMENTS/2);
NDRange local(256);
queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
using namespace std;
int main(int argc, char* argv[]){
unsigned long long minNum, maxNum;
if (argc == 2){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[1]) + 1);
}
else if (argc == 3){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else if (argc == 4){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else return -1;
unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
thread workThread, outThread, genThread;
q = new unsigned long long[NUM_ELEMENTS];
newList(minNum, q);
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
init();
genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
workThread.join();
genThread.join();
for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum + NUM_ELEMENTS; i += NUM_ELEMENTS){
old = result;
q = newq;
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
genThread = thread(newList, i+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
outThread = thread(printOutput, i-NUM_ELEMENTS, old);
workThread.join();
outThread.join();
genThread.join();
delete[] old;
delete[] q;
q = old = nullptr;
}
delete[] newq;
delete[] result;
return 0;
}
And the kernel code
bool testCantor(unsigned long p, unsigned long q){
while(q % 3 == 0){
q /= 3;
if (p/q == 1) return p==q;
p %= q;
}
unsigned long p_start = p;
do{
unsigned long p3 = p * 3;
if(p3/q == 1) return false;
p = p3 % q;
} while(p != p_start);
return true;
}
int coprime(unsigned long a, unsigned long b){
unsigned long c;
while (a != 0){
c = a;
a = b % a;
b = c;
}
return 2*((b == 1)&1);
}
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
int gid = get_global_id(0);
unsigned long q = input[gid], p = 1;
output[gid] = 0;
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
gid = 32767 - get_global_id(0);
q = input[gid];
output[gid] = 0;
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
}
Is there a better way for me to implement this? I'm pretty new to OpenCL(I started with it less than 24 hours ago) so am probably making some rather obvious mistakes.
EDIT: I figured out that I was only spawning 2 threads. I've changed it to spawn 32 threads with 256 q each. It crashes when I run it from 13 to 14 now though and I have no idea why. It doesn't crash from 10 to 11
EDIT2: I implemented most of the suggestions(couldn't figure out how to remove the if(coprime(p,q))) and now it runs a little bit faster(less than a second difference at n=10). Is there much else I can do to speed it up? it's only running 33% faster than my processor on the same task.
EDIT3: managed to implement it with bit twiddling. Not sure if there are any other conditionals I can do that to though. Still not seeing a very big performance boost(any suggestions?)