I need help to parallelize this code using OpenMP

Question

I wrote a C program that I would like to parallelize using OpenMP (I am a beginner and have only a few days to solve this task). Let's start from main. First, I initialize six arrays (Vx, Vy, Vz, tetap, phip, teta). Then there is a for loop in which Ncoll grows up to Nmax. Inside this loop I allocate memory for an array of the structure defined at the very top of the code; the array is called coll_CPU and its size increases on every iteration. I then pick some values from the arrays mentioned above and store them in the structure, so that at this point coll_CPU holds Ncoll elements. During this process I use some of the functions defined outside of main (these functions are random number generators). Now comes the important part: in my serial code I use a for loop to pass every single element of coll_CPU to a function called CollisioniCPU (this function just takes its inputs and multiplies them by 2). My goal is to parallelize this loop so that all of my CPU cores share the work and speed up the process.

Here are the codes:

main.c

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>

/* 2*pi and pi written as decimal literals with ten significant digits. */
#define pi2 6.283185307
#define pi  3.141592654
/* NOTE(review): IMUL is not referenced anywhere in this listing and __mul24
   is not declared by any header included above; it looks like a leftover
   from a GPU version of the code - confirm before using it. */
#define IMUL(a,b) __mul24(a,b)

/* Wrapper around the integer seed ("seme") whose address is passed to rf()
   and genk(). */
typedef struct { int seme; } iniran;

/*
 * One candidate collision between two particles, A and B.
 * main() fills every field; CollisioniCPU() rewrites the velocity and angle
 * fields in place.  Member order is unchanged, so the memory layout and any
 * positional initializers stay valid.
 */
typedef struct {
    int   jp1, jp2;              /* 1-based indices of particles A and B */
    float kx, ky, kz;            /* direction from genk(), tested against the relative velocity */
    float vAx, vAy, vAz;         /* velocity components of particle A */
    float vBx, vBy, vBz;         /* velocity components of particle B */
    float tetaAp, phiAp, tetaA;  /* particle A entries of tetap[], phip[], teta[] */
    float tetaBp, phiBp, tetaB;  /* particle B entries of tetap[], phip[], teta[] */
    float kAx, kAy, kAz;         /* second genk() direction, stored for A */
    float kBx, kBy, kBz;         /* third genk() direction, stored for B */
    int   caso;                  /* case code: main() stores 1 and later tests for 4 and 0 */
} stato_struct;

/* Collision records of the current batch; main() allocates and frees this
   buffer once per batch. */
stato_struct *coll_CPU = NULL;

/* Not referenced anywhere in this listing. */
unsigned timer;

#include "DSMC_kernel_float.c"

//=============================================================
/* Returns *a when *a < *b, otherwise *b (so ties and unordered comparisons
   yield *b).  Neither pointed-to value is modified. */
float min(float *a, float *b)
{
    return (*a < *b) ? *a : *b;
}
//=============================================================
/* Returns *a when *a > *b, otherwise *b (so ties and unordered comparisons
   yield *b).  Neither pointed-to value is modified. */
float max(float *a, float *b)
{
    return (*a > *b) ? *a : *b;
}
//=============================================================
float rf(int *idum){
  static int iff=0;
  static int inext, inextp, ma[55];
  int mj, mk;
  int i, k, ii;
  float ret_val;

  if (*idum<0 || iff==0) {
      iff=1;
      mj=161803398 - abs(*idum);
      mj %= 1000000000;
      ma[54]=mj;
      mk=1;
      for (i=1; i<=54; ++i){
        ii=(i*21)%55;
        ma[ii-1]=mk;
        mk=mj-mk;
        if (mk<0) {
            mk += 1000000000;
        }
      mj= ma[ii-1];
      }
    for(k=1; k<=4; ++k) {
        for(i=1; i<=55; ++i){
            ma[i-1] -= ma[(i+30)%55];
            if (ma[i-1]<0){
                ma[i-1] += 1000000000;
            }
        }
    }
    inext=0;
    inextp=31;
    *idum=1;
    }
   ++inext;
   if (inext==56){
    inext=1;
   }
  ++inextp;
  if (inextp==56){
    inextp=1;
  }
  mj=ma[inext-1]-ma[inextp-1];
  if (mj<0){
    mj += 1000000000;
  }
  ma[inext-1]=mj;
  ret_val=mj*1.0000000000000001e-9;
  return ret_val;
  }
      //============================================================            
 /*
  * Draws a random unit-length direction and writes its components to
  * *kx, *ky and *kz.
  *
  * Two numbers are taken from rf(): the first is mapped to the x component
  * in [-1, 1], the second to an angle in [0, 2*pi) that splits the remaining
  * length sqrt(1 - kx^2) between y (cosine) and z (sine).
  *
  * p2seme: seed pointer forwarded unchanged to rf().
  * Returns 0 in every case.
  */
 int genk(float *kx, float *ky, float *kz, int *p2seme)
 {
     extern float rf(int *);
     static float azimuth;

     *kx = rf(p2seme) * 2. - 1.f;
     /* Temporarily holds the length of the y-z projection. */
     *ky = sqrtf(1. - (*kx) * (*kx));
     azimuth = pi2 * rf(p2seme);
     *kz = (*ky) * sinf(azimuth);
     *ky = (*ky) * cosf(azimuth);
     return 0;
 }
//==============================================================
int main(void){

float msec_kernel;  
int Np=10000, Nmax=512; 
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;

iniran1.seme=7593;

for(jp=1;jp<=Np;jp++){
  if(jp<=Np/2){
  Vx[jp-1]=2.5;
  Vy[jp-1]=0;
  Vz[jp-1]=0;
  tetap[jp-1]=0;
  phip[jp-1]=0;
  teta[jp-1]=0;
}

for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){

  coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
  jcoll=0;

  while (jcoll<Ncoll){
     jp1=1+floorf(Np*rf(&iniran1.seme));
     jp2=1+floorf(Np*rf(&iniran1.seme));
     genk(&kx,&ky,&kz,&iniran1.seme);
     Vrx=Vx[jp2-1]-Vx[jp1-1];
     Vry=Vy[jp2-1]-Vy[jp1-1];
     Vrz=Vz[jp2-1]-Vz[jp1-1];
     scalprod=Vrx*kx+Vry*ky+Vrz*kz;

 if (scalprod<0) {
    genk(&kAx,&kAy,&kAz,&iniran1.seme);
    genk(&kBx,&kBy,&kBz,&iniran1.seme);
    coll_CPU[jcoll].jp1= jp1;
    coll_CPU[jcoll].jp2=jp2;
    coll_CPU[jcoll].kx=kx;
    coll_CPU[jcoll].ky=ky;
    coll_CPU[jcoll].kz=kz;
    coll_CPU[jcoll].vAx=Vx[jp1-1];
    coll_CPU[jcoll].vAy=Vy[jp1-1];              
    coll_CPU[jcoll].vAz=Vz[jp1-1];              
    coll_CPU[jcoll].vBx=Vx[jp2-1];              
    coll_CPU[jcoll].vBy=Vy[jp2-1];              
    coll_CPU[jcoll].vBz=Vz[jp2-1];              
    coll_CPU[jcoll].tetaAp=tetap[jp1-1];                
    coll_CPU[jcoll].phiAp=phip[jp1-1];              
    coll_CPU[jcoll].tetaA=teta[jp1-1];              
    coll_CPU[jcoll].tetaBp=tetap[jp2-1];                
    coll_CPU[jcoll].phiBp=phip[jp2-1];              
    coll_CPU[jcoll].tetaB=teta[jp2-1];
    coll_CPU[jcoll].kAx=kAx;
    coll_CPU[jcoll].kAy=kAy;
    coll_CPU[jcoll].kAz=kAz;
    coll_CPU[jcoll].kBx=kBx;
    coll_CPU[jcoll].kBy=kBy;
    coll_CPU[jcoll].kBz=kBz;
    coll_CPU[jcoll].caso=1;

    jcoll++;
   }
  }

  clock_t t;
  t = clock();
  #pragma omp parallel for private(id)  //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
  for(id=0;id<Nmax;id++){   
     CollisioniCPU(coll_CPU,id);
  } 
  t = clock() - t;
  msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC; 
  printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);

  for (ind=0;ind<Ncoll;ind++){
     if (coll_CPU[ind].caso==4)
    Ncoll_eff++;
    else if (coll_CPU[ind].caso==0)
       Ncoll_div++;
    else 
       Ncoll_dim++;
    }
  free(coll_CPU);
  }
return 0;
}

DSMC_kernel_float.c

/*
 * Doubles, in place, the velocity and angle fields of record `id`:
 * vAx, vAy, vAz, tetaAp, phiAp, tetaA for particle A and the matching six
 * fields for particle B.  All other fields of the record are left untouched.
 *
 * coll_CPU: array of collision records; id: index of the record to update.
 * Only coll_CPU[id] is read or written.
 */
void CollisioniCPU(stato_struct *coll_CPU, int id)
{
    stato_struct *rec = &coll_CPU[id];
    float *fields[] = {
        &rec->vAx, &rec->vAy, &rec->vAz,
        &rec->tetaAp, &rec->phiAp, &rec->tetaA,
        &rec->vBx, &rec->vBy, &rec->vBz,
        &rec->tetaBp, &rec->phiBp, &rec->tetaB
    };
    const int nfields = (int)(sizeof fields / sizeof fields[0]);

    for (int i = 0; i < nfields; ++i) {
        *fields[i] = 2 * *fields[i];
    }
}

In order to compile the program I type this line on the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm followed by export OMP_NUM_THREADS=1; however, once I run the executable I get this error message:

Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted

What does this error mean? What did I do wrong in the main function when I tried to parallelize the for loop? And most importantly: what should I type instead in order to make my code run in parallel? Please help me out if you can, because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.

user0815 · Accepted Answer · 2015-03-02 18:05:34Z

1

Changing the inner loop as follows should bring you one step further.

#pragma omp parallel for private(id)
for(id=0;id<Ncoll;id++){   
  CollisioniCPU(coll_CPU,id);
}

Your OpenMP line seems okay, but I doubt that it will lead to significant improvements in runtime. You should optimize the surrounding code as well. Allocating the memory once outside of your loops would be a good start.

By the way, is there any reason for this verbose coding style, rather than a more compact and readable version like this one?

/* Doubles in place the twelve velocity/angle fields of coll_CPU[id];
   no other record and no other field is touched. */
void CollisioniCPU(stato_struct *coll_CPU, int id) {
  stato_struct *const rec = &coll_CPU[id];

  /* particle A */
  rec->vAx    = rec->vAx * 2;
  rec->vAy    = rec->vAy * 2;
  rec->vAz    = rec->vAz * 2;
  rec->tetaAp = rec->tetaAp * 2;
  rec->phiAp  = rec->phiAp * 2;
  rec->tetaA  = rec->tetaA * 2;

  /* particle B */
  rec->vBx    = rec->vBx * 2;
  rec->vBy    = rec->vBy * 2;
  rec->vBz    = rec->vBz * 2;
  rec->tetaBp = rec->tetaBp * 2;
  rec->phiBp  = rec->phiBp * 2;
  rec->tetaB  = rec->tetaB * 2;
}

answered Mar 2, 2015 at 18:05

user0815

1,4088 silver badges11 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Federico Gentile Over a year ago

Perfect! That was a dumb mistake on my part! As for the function, its body was just a random example with no real meaning; thanks anyway for your advice!

Collectives™ on Stack Overflow

I need help to parallelize this code using OpenMP

1 Answer 1

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related