1

This is my first time posting, so I apologise for any novice mistakes. Please also excuse the fact that not all variable names are in English. My problem is the following: I've written this code using OpenMP in both Visual Studio 2010 and Eclipse for C/C++ with the Cygwin GCC compiler toolchain. In Visual Studio I get a speed-up, but in Eclipse the parallel version runs about twice as slowly as the serial version. Can someone please explain what I have done wrong? In short, I'm just measuring the speed-up obtained when I copy from an array of 3D vectors into a double array in order to send it over MPI.

#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <vector>
// Benchmark body: fills a flat array of doubles from worker_buffer, first
// serially and then inside an OpenMP parallel region, prints the clock()
// ticks spent in each phase and verifies the contents of the last buffer.
// NOTE(review): the enclosing function header is not visible in this excerpt;
// only its closing brace is.
const int NUMAR_FORME=10; // not used in this excerpt
  const int NUMAR_SECUNDE_SIMULATE=60; // number of buffers (rows of worker_buffer[0])
  const int dimensiuni_forme[10]={100,200,300,400,500,600,700,800,900,10000}; // size of each buffer; only entry [9] is used below
  //-------- the buffers; currently only worker_buffer and buff are used
     std::vector<std::vector<std::vector<double> > > worker_buffer;
  std::vector<std::vector<double> > send_buffer,corect;
  double **buff;
   double **worker_buffer1;
  long i,j,k,l;
int flag=0;
int numarator=0; // number of test iterations run so far
clock_t start;
// NOTE(review): the serial timing starts here, so it also covers the
// allocation of worker_buffer below, not just the copy loop.
start=clock();
worker_buffer.resize(1);
buff = new double* [2];
int de_scris=0; // index of the buff slot to write into; alternates 0, 1, 0, 1, ... between iterations
worker_buffer[0].resize(NUMAR_SECUNDE_SIMULATE);
for(i=0;i<NUMAR_SECUNDE_SIMULATE;i++)
    worker_buffer[0][i].resize(dimensiuni_forme[9]);



// Serial version: 60 iterations, each allocating a fresh output buffer and
// filling it.
while(numarator<60)
{

        // Free the buffer written by the previous iteration before switching slots.
        if(numarator!=0)
            delete [] buff[de_scris];

    if(numarator!=0)
        de_scris=(de_scris+1)%2;
    long limita;
    limita=NUMAR_SECUNDE_SIMULATE*dimensiuni_forme[9]*3; // the factor 3 comes from the 3D vector structure: three doubles per element
    buff[de_scris]= new double [limita];
    for(i=0;i<NUMAR_SECUNDE_SIMULATE;i++)
    {   for(j=0;j<dimensiuni_forme[9];j++)
        {
            // The source value is generated here as well, so the serial
            // phase writes worker_buffer while the parallel phase only reads it.
            worker_buffer[0][i][j]=(i*dimensiuni_forme[9]+j)*3;
            buff[de_scris][(i*dimensiuni_forme[9]+j)*3]=worker_buffer[0][i][j];
            buff[de_scris][(i*dimensiuni_forme[9]+j)*3+1]=worker_buffer[0][i][j]+0.5;
            buff[de_scris][(i*dimensiuni_forme[9]+j)*3+2]=worker_buffer[0][i][j]+0.75;
        }
    }
    numarator++;

}

// Elapsed clock() ticks for the serial phase.
// NOTE(review): clock() reports CPU time rather than wall-clock time, so the
// figure may not be comparable with the multi-threaded phase; a wall-clock
// timer such as omp_get_wtime() is worth considering.
start=clock()-start;
printf("TICKS TOTAL %ld \n",start);
bool ad=true; // stays true while every checked value matches
long nr;      // value expected at the start of the current triple
// Verify the last serial buffer: position 3k must hold 3k, followed by
// 3k+0.5 and 3k+0.75. Comparisons use exact floating-point equality.
for(i=0;i<NUMAR_SECUNDE_SIMULATE*dimensiuni_forme[9]*3;i++)
{
    if(i%3==0)
        nr=i;
    if(i%3==0 && buff[de_scris][i]!=i)
        ad=false;
    else
        if(i%3==1 &&buff[de_scris][i]!=(nr+0.5))
            ad=false;
        else
            if(i%3==2 && buff[de_scris][i]!=(nr+0.75))
                ad=false;
}
if(ad==false)
    printf("not correct \n");
start=clock();

    // NOTE(review): the counter is reset, so the first parallel iteration
    // skips the delete below and overwrites buff[de_scris] without freeing
    // the buffer left over from the serial phase.
    numarator=0;
// Parallel version
while(numarator<60)
{


        // Free the buffer written by the previous iteration before switching slots.
        if(numarator!=0)
            delete [] buff[de_scris];

    long index, limita,id;
    omp_set_num_threads(2);

    if(numarator!=0)
        de_scris=(de_scris+1)%2;
    limita=NUMAR_SECUNDE_SIMULATE*dimensiuni_forme[9]*3; // 3 doubles per element, same as in the serial version
    buff[de_scris]= new double [limita];
    // Every thread of the team runs the whole block below; index and id are
    // per-thread copies.
 #pragma omp parallel shared(worker_buffer,limita,buff) private(index,id)
    {
        // Executed by every thread on every iteration, inside the timed section.
        printf("intram cu %d threaduri \n", omp_get_num_threads());
        id=omp_get_thread_num();
        //index=id;
        // Manual work split: each thread starts at its own id and advances
        // by the team size, so neighbouring elements are written by
        // different threads.
        // NOTE(review): this interleaving may put writes from different
        // threads on the same cache line and slow the loop down.
        for(index=id;(index*3)<limita;index+=omp_get_num_threads())
        {
            buff[de_scris][index*3]=worker_buffer[0][index/dimensiuni_forme[9]][index%dimensiuni_forme[9]];  // send_buff[index].x will go here
            buff[de_scris][index*3+1]=buff[de_scris][index*3]+0.5;
            buff[de_scris][index*3+2]=buff[de_scris][index*3]+0.75;
        }

    //  index+=omp_get_num_threads();


    }//end parallel zone
    numarator++;
}

// Elapsed clock() ticks for the parallel phase.
start=clock()-start;
printf("TICKS TOTAL %ld \n",start);
 ad=true;
    // Test the last parallel buffer for correctness, using the same check as
    // after the serial phase.
for(i=0;i<NUMAR_SECUNDE_SIMULATE*dimensiuni_forme[9]*3;i++)
{
    if(i%3==0)
        nr=i;
    if(i%3==0 && buff[de_scris][i]!=i)
        ad=false;
    else
        if(i%3==1 &&buff[de_scris][i]!=(nr+0.5))
            ad=false;
        else
            if(i%3==2 && buff[de_scris][i]!=(nr+0.75))
                ad=false;
}
if(ad==false)
    printf("not correct \n");
 // NOTE(review): buff and the buffer it still points to are not freed in
 // this excerpt before returning.
 return 0;
 }
2
  • Do you measure the time only with the clock() function in your code? If so, you should know that clock() returns the CPU time used, meaning that if you use 2 processors the time returned by clock() will increase twice as fast as real time. I am not 100% sure whether Visual C++ implements the clock() function the same way, which could be the reason you measure different times. Commented Mar 24, 2012 at 11:36
  • I've tried using time() and clock_gettime(); the result was the same. Commented Mar 25, 2012 at 7:43

1 Answer 1

1

Judging by how you organized this for loop:

    // Loop from the question: each thread starts at its own id and advances
    // by the number of threads in the team.
    for(index=id;(index*3)<limita;index+=omp_get_num_threads())
    {
        buff[de_scris][index*3]=worker_buffer[0][index/dimensiuni_forme[9]][index%dimensiuni_forme[9]];  // send_buff[index].x will go here
        buff[de_scris][index*3+1]=buff[de_scris][index*3]+0.5;
        buff[de_scris][index*3+2]=buff[de_scris][index*3]+0.75;
    }

and assuming that you have 4 threads, your threads will get interleaved index values:

thread 0: 0, 4,  8, 12,...
thread 1: 1, 5,  9, 13,...
thread 2: 2, 6, 10, 14,...
thread 3: 3, 7, 11, 15,...

which may be causing cache ping-pong effects, since values written by different threads may land on the same cache line, thus slowing down your execution.

Try to use a simple for loop with static partitioning instead, in order to get contiguous partitions:

    // Suggested replacement: a plain counted loop over the limita / 3
    // elements, with the iterations divided among the threads by OpenMP
    // instead of by hand.
    #pragma omp parallel for
    for(index = 0; index < limita / 3;index++)
    {
        buff[de_scris][index*3]=worker_buffer[0][index/dimensiuni_forme[9]][index%dimensiuni_forme[9]];  // send_buff[index].x will go here
        buff[de_scris][index*3+1]=buff[de_scris][index*3]+0.5;
        buff[de_scris][index*3+2]=buff[de_scris][index*3]+0.75;
    }
Sign up to request clarification or add additional context in comments.

1 Comment

You were right, thank you very much. I used the loop that you wrote and it showed a speed-up. Strange how in Visual Studio it showed a speed-up without using the for directive.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.