2

I have been trying to write a program that counts rationals on the Cantor set with certain denominators. I found that with my computer it takes 20 hours or more to compute the number between 3^14 and 3^15. I thought that since this is testing a large number of separate values it would be a good thing to implement on a graphics card with OpenCL. When I tried to implement it though I get performance a few orders of magnitude slower than my CPU implementation. Here is the code of my attempt.

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>

#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2

const int NUM_ELEMENTS = 32768;

void printOutput(unsigned long long start, unsigned long long *values){
    for(unsigned int i = 0; i < NUM_ELEMENTS; i++)
       if (values[i] != 0)
            std::cout << start+i << ',' << values[i] << std::endl;
}

void newList(unsigned long long start, unsigned long long *dataList){
    for(int i=0; i < NUM_ELEMENTS; ++i)
        dataList[i] = start + i;
}

using namespace cl;

Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;

int init() {
    cl_int status = 0;
    const char* buildOption ="-x clc++ ";
    std::vector<Platform> platforms;
    status = Platform::get(&platforms);
    if (status != CL_SUCCESS){
        std::cout<<"Error: Getting platforms!"<<std::endl;
        return FAILURE;
    }
    std::vector<cl::Platform>::iterator iter;
    for(iter = platforms.begin(); iter != platforms.end(); ++iter)
        if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
            break;
    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
    bool gpuNotFound = false;
    try{
        context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
    }
    catch(std::exception e){
        gpuNotFound = true;
    }
    if(gpuNotFound){
        std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
        context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
        if (status != CL_SUCCESS){
            std::cout<<"Error: Creating context!"<<std::endl;
            return FAILURE;
        }
    }
    Program program;
    try{
        std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        queue = CommandQueue(context, devices[0]);
        std::ifstream sourceFile("Rationals.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
        program = Program(context, source);
        program.build(devices, buildOption);
        kernelA = Kernel(program, "countRationals");
        inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
        outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
    }catch(cl::Error e){
        std::cout << e.what() << std::endl;
        std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try{
        queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
        kernelA.setArg(0, inputBuffer);
        kernelA.setArg(1, outputBuffer);
        NDRange global(NUM_ELEMENTS/2);
        NDRange local(256);
        queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
        queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
    }catch(cl::Error e){
        std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

using namespace std;

int main(int argc, char* argv[]){
    unsigned long long minNum, maxNum;
    if (argc == 2){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[1]) + 1);
    }
    else if (argc == 3){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else if (argc == 4){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else return -1;
    unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
    thread workThread, outThread, genThread;
    q = new unsigned long long[NUM_ELEMENTS];
    newList(minNum, q);
    result = new unsigned long long[NUM_ELEMENTS];
    newq = new unsigned long long[NUM_ELEMENTS];
    init();
    genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
    workThread = thread(execute, q, result);
    workThread.join();
    genThread.join();
    for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum  + NUM_ELEMENTS; i += NUM_ELEMENTS){
        old = result;
        q = newq;
        result = new unsigned long long[NUM_ELEMENTS];
        newq = new unsigned long long[NUM_ELEMENTS];
        genThread = thread(newList, i+NUM_ELEMENTS, newq);
        workThread = thread(execute, q, result);
        outThread = thread(printOutput, i-NUM_ELEMENTS, old);
        workThread.join();
        outThread.join();
        genThread.join();
        delete[] old;
        delete[] q;
        q = old = nullptr;
    }
    delete[] newq;
    delete[] result;
    return 0;
}

And the kernel code

bool testCantor(unsigned long p, unsigned long q){
    while(q % 3 == 0){
        q /= 3;
        if (p/q == 1) return p==q;
        p %= q;
    }
    unsigned long p_start = p;
    do{
        unsigned long p3 = p * 3;
        if(p3/q == 1) return false;
        p = p3 % q;
    } while(p != p_start);
    return true;
}

int coprime(unsigned long a, unsigned long b){
    unsigned long c;
    while (a != 0){
        c = a;
        a = b % a;
        b = c;
    }
    return 2*((b == 1)&1);
}

__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] += coprime(i,q);
    }
    gid = 32767 - get_global_id(0);
    q = input[gid];
    output[gid] = 0;
    for(p = 1; p <= q/3; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }
}

Is there a better way for me to implement this? I'm pretty new to OpenCL(I started with it less than 24 hours ago) so am probably making some rather obvious mistakes.

EDIT: I figured out that I was only spawning 2 threads. I've changed it to spawn 32 threads with 256 q each. It crashes when I run it from 13 to 14 now though and I have no idea why. It doesn't crash from 10 to 11

EDIT2: I implemented most of the suggestions(couldn't figure out how to remove the if(coprime(p,q))) and now it runs a little bit faster(less than a second difference at n=10). Is there much else I can do to speed it up? it's only running 33% faster than my processor on the same task.

EDIT3: managed to implement it with bit twiddling. Not sure if there are any other conditionals I can do that to though. Still not seeing a very big performance boost(any suggestions?)

1 Answer 1

1
int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try
    {
       ...
    }
    catch(cl::Error e)
    {
       ...
    }
    return SUCCESS;

is creating buffers. If you are using execute() many times, it will have a buffer create/garbage-collect overhead. Also your global range is just two times the local range which means only two compute unit of your gpu will be used. If your card has 20 compute units then global range should be at least 40*local range. Just 512 elements are not enough to keep gpu busy. At least for half of the cores. The for(p = 1; p <= q/3; p++) loop is not same for all cores. Some cores counts to 10 while another one counts to 100, this destroys the order of execution between cores. You should make a more balanced kernel. For example:

Feed the first core to calculate first and last elements, second core to calculate second and N-1st, .... so all cores do nearly equal jobs instead of being idle waiting the latter cores.

__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output)
{
    // computing first element (least workload among the array)
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++) // counts to 10 ....
    {
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,q))
                    output[gid] += 2;
    }

    //+ computing (N-gid) element (heaviest workload among the array)
    int N_gid = findOtherIndex(get_global_id(0));
    unsigned long N_q = input[N_gid], N_p = 1;
    output[N_gid] = 0;
    for(N_p = 1; N_p <= N_q/3; N_p++) // counts to 100? 
    {
        if(N_p % 3 != 0 && testCantor(N_p, N_q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,N_q))
                    output[N_gid] += 2;
    }

     //this way, adjacent cores will have "closer to equal"  work. 

}

So, if you have 4096 elements, first core will compute 1st and 4096th elements, second core will compute 2nd and 4095th elements, .... local range of 64 and global range of 4096 should be fine for a start. If you use too many "if", then you should put an "else" for each of them to do dummy work to keep order of computation between cores. Or you can remove some "if" if they are as simple as:

 if(a>b)c+=d;

can be intercepted as

 c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu.
 implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b

Edit:

 giving global size a multiple of number of cores of GPU could give an extre performance.

Edit: lets optimize the

 for(p = 1; p <= q/3; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

p%3!=0 means only 1 or 2.

p%3 == 1 is satisfied by p=1,4,7,10,... => our first loop

p%3 == 2 is satisfied by p=2,5,8,11,... => our second loop

Lets concatenate these:

 for(p = 1; p <= q/3; p+=3){ // p%3==1 is satisfied
        if(testCantor(p, q)) // so no need for testing modulus
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

 for(p = 2; p <= q/3; p+=3){ // p%3==2 is satisfied
        if(testCantor(p, q)) // so no need for testing modulus 
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

//so we got rid of this part:
for(p = 0; p <= q/3; p+=3){      // p%3==0 is not engaging "if" so we dont need
            if(testCantor(p, q))                     // this loop anymore lol :D
                for(unsigned long i = p; i <= q/3; i *= 3)
                        output[gid] +=  coprime(i,q);
        }

As a bonus, total loop iterations are reduced by 1/3 which should boost a little bit.

Edit: the while-loop has a modulus and does not use floating point potential of GPU.

//here convert integers to floats a,b,c
while (a != 0){ // this will need a tolerance range, exact zero is nearly impossible
        c = a;
        a = b % a; //emulate this using fp
         // example: 5%3 --> 5.0 / 3.0 gives 1.yyy so we have 1 at least
         // then we subtract like: 5.0 - floor(5.0/3.0)*3.0
         // we have 2.0 which is 5%3
         // this is just a single condition
         // looks like b%a can be b-floor(b/a)*a but Im not sure
         // good luck!
        b = c;
    }
// here convert floats back to integers again

How can one emulate modulus with using only fp arithmetics without losing precision?
Sign up to request clarification or add additional context in comments.

10 Comments

I'm not sure how to implement bitTwiddle, but otherwise I implemented most of your suggestions and got a pretty major speed boost.
So, the "data array folding" onto "thread array" worked? Bittwiddling needs some kind of signum functions with an absolute value function as in this question stackoverflow.com/questions/21465367/… .
I managed to implement it with 2*((b==1)&1). OpenCL formats bool values really weirdly
What is your CPU and GPU? Emulating integer calculations withh floating-point calculations can give extra performance if you are not memory bottlencked. Can you give us kernel times, buffer read/write times please?
I have an i7-4770k at 4.5ghz and a Sapphire 7970 Reference(running at default clocks 925/1375). Using floating point might work, but there is a loss of precision(12 bits lost when using double as compared to unsigned long). What are good ways to get those times on Windows? I know how to time on linux, but I don't think there is a similar command on windows.
|

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.