0

I wrote a C program that I would like to parallelize using OpenMP (I am a beginner and have only a few days to solve this task). Let's start from main: first I initialize six vectors (Vx, Vy, Vz, tetap, phip, teta); then there is a for loop over Ncoll, which runs up to Nmax. Inside this loop I allocate memory for an array of the structure defined at the very top of the code; the array is called coll_CPU and its size grows with every cycle. I then pick some of the values from the vectors mentioned above and place them into the array, so at this point coll_CPU is filled with Ncoll elements. During this process I use some of the functions declared outside of main (these functions are random number generators). Now comes the important part: in my serial code I use a for loop to pass every single element of the array to a function called CollisioniCPU (this function just takes the inputs and multiplies them by 2). My goal is to parallelize this loop so that each of my CPUs contributes to this operation and speeds up the process.

Here are the codes:

main.c

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>

/* 2*pi and pi, written out to ten significant digits. */
#define pi2 6.283185307
#define pi  3.141592654
/* Forwards to __mul24, which nothing in this file declares and nothing in the
   visible code invokes. NOTE(review): looks like a leftover from a CUDA
   version of the program -- confirm before removing. */
#define IMUL(a,b) __mul24(a,b)

/* Wrapper around the integer seed ("seme"). Presumably the type of the
   iniran1 object whose seme member main hands by address to rf() and
   genk() -- confirm, since iniran1's declaration is not visible here. */
typedef struct { int seme; } iniran;

/*
 * One collision record: main fills every member for a selected particle
 * pair, and CollisioniCPU later rewrites the vA*, vB*, teta* and phi*
 * members in place. Member order is significant for the memory layout.
 */
typedef struct {
   int   jp1, jp2;               /* 1-based particle indices drawn by main */
   float kx, ky, kz;             /* first genk() vector drawn for the pair */
   float vAx, vAy, vAz;          /* Vx/Vy/Vz entries of particle jp1 */
   float vBx, vBy, vBz;          /* Vx/Vy/Vz entries of particle jp2 */
   float tetaAp, phiAp, tetaA;   /* tetap/phip/teta entries of particle jp1 */
   float tetaBp, phiBp, tetaB;   /* tetap/phip/teta entries of particle jp2 */
   float kAx, kAy, kAz;          /* second genk() vector */
   float kBx, kBy, kBz;          /* third genk() vector */
   int   caso;                   /* set to 1 on fill; main later tests it against 4 and 0 */
} stato_struct;

/* Array of collision records for the current batch; main allocates it and
   frees it again on every pass of its Ncoll loop. */
stato_struct *coll_CPU=0;

/* Not referenced anywhere in the visible code. */
unsigned int timer;

/* Textual inclusion of the kernel source that defines CollisioniCPU; it has
   to come after the stato_struct typedef the kernel relies on. */
#include "DSMC_kernel_float.c"

//=============================================================
/* Returns the smaller of the two pointed-to values. *b is the result
   whenever "*a < *b" does not hold, which covers equal operands too. */
float min(float *a, float*b){
   return (*a<*b) ? *a : *b;
}
//=============================================================
/* Returns the larger of the two pointed-to values. *b is the result
   whenever "*a > *b" does not hold, which covers equal operands too. */
float max(float *a, float*b){
  return (*a>*b) ? *a : *b;
}
//=============================================================
/*
 * Pseudo-random number generator returning a float.
 *
 * idum: in/out seed. The 55-entry table is (re)built when *idum is negative
 *       or on the very first call; after seeding, *idum is overwritten with 1.
 * Returns the freshly generated table entry scaled by 1e-9.
 *
 * All generator state lives in function-level statics with no locking, so
 * concurrent calls from several threads would race on it.
 * NOTE(review): the layout resembles Knuth's subtractive generator (the
 * "ran3" routine of Numerical Recipes) -- confirm against the original.
 * NOTE(review): the scaled value is narrowed to float, so an entry very
 * close to 1e9 may round to exactly 1.0f; callers should not assume the
 * result is strictly below 1 -- verify.
 */
float rf(int *idum){
  static int iff=0;                    /* becomes 1 once the table has been seeded */
  static int inext, inextp, ma[55];    /* two running cursors and the state table */
  int mj, mk;
  int i, k, ii;
  float ret_val;

  /* Seeding pass: requested by a negative seed, or forced on first use. */
  if (*idum<0 || iff==0) {
      iff=1;
      mj=161803398 - abs(*idum);
      mj %= 1000000000;
      ma[54]=mj;
      mk=1;
      /* Fill the remaining 54 slots, visiting them in steps of 21 modulo 55. */
      for (i=1; i<=54; ++i){
        ii=(i*21)%55;
        ma[ii-1]=mk;
        mk=mj-mk;
        if (mk<0) {
            mk += 1000000000;
        }
      mj= ma[ii-1];
      }
    /* Four mixing sweeps over the whole table before it is used. */
    for(k=1; k<=4; ++k) {
        for(i=1; i<=55; ++i){
            ma[i-1] -= ma[(i+30)%55];
            if (ma[i-1]<0){
                ma[i-1] += 1000000000;
            }
        }
    }
    inext=0;
    inextp=31;
    *idum=1;   /* a later call with this seed does not trigger reseeding */
    }
   /* Advance both cursors, wrapping from 56 back to 1 (they are 1-based). */
   ++inext;
   if (inext==56){
    inext=1;
   }
  ++inextp;
  if (inextp==56){
    inextp=1;
  }
  /* New value = difference of the two cursor entries, raised by 1e9 if negative. */
  mj=ma[inext-1]-ma[inextp-1];
  if (mj<0){
    mj += 1000000000;
  }
  ma[inext-1]=mj;   /* store it back so it feeds later draws */
  ret_val=mj*1.0000000000000001e-9;
  return ret_val;
  }
      //============================================================            
 /*
  * Writes a unit-length vector into (*kx, *ky, *kz) using two rf() draws.
  *
  * The first draw is mapped to kx = 2*r - 1; the second is scaled by pi2 to
  * give an angle phi, and the remaining length sqrt(1 - kx*kx) is split into
  * ky (cosine part) and kz (sine part).
  *
  * kx, ky, kz: output components, all three are overwritten.
  * p2seme:     seed pointer forwarded unchanged to rf().
  * Returns 0 unconditionally.
  *
  * The angle is an automatic variable (it used to be static), so this
  * function itself keeps no state between calls; rf() still does.
  */
 int genk(float *kx, float *ky, float *kz, int *p2seme){
    extern float rf(int *);
    float phi;

    *kx=rf(p2seme) * 2. -1.f;
    /* *ky temporarily holds the in-plane radius until phi is known. */
    *ky= sqrtf(1. - *kx * *kx);
    phi=pi2*rf(p2seme);
    *kz=*ky * sinf(phi);
    *ky *= cosf(phi);
    return 0;
}
//==============================================================
int main(void){

float msec_kernel;  
int Np=10000, Nmax=512; 
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;

iniran1.seme=7593;

for(jp=1;jp<=Np;jp++){
  if(jp<=Np/2){
  Vx[jp-1]=2.5;
  Vy[jp-1]=0;
  Vz[jp-1]=0;
  tetap[jp-1]=0;
  phip[jp-1]=0;
  teta[jp-1]=0;
}

for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){

  coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
  jcoll=0;

  while (jcoll<Ncoll){
     jp1=1+floorf(Np*rf(&iniran1.seme));
     jp2=1+floorf(Np*rf(&iniran1.seme));
     genk(&kx,&ky,&kz,&iniran1.seme);
     Vrx=Vx[jp2-1]-Vx[jp1-1];
     Vry=Vy[jp2-1]-Vy[jp1-1];
     Vrz=Vz[jp2-1]-Vz[jp1-1];
     scalprod=Vrx*kx+Vry*ky+Vrz*kz;

 if (scalprod<0) {
    genk(&kAx,&kAy,&kAz,&iniran1.seme);
    genk(&kBx,&kBy,&kBz,&iniran1.seme);
    coll_CPU[jcoll].jp1= jp1;
    coll_CPU[jcoll].jp2=jp2;
    coll_CPU[jcoll].kx=kx;
    coll_CPU[jcoll].ky=ky;
    coll_CPU[jcoll].kz=kz;
    coll_CPU[jcoll].vAx=Vx[jp1-1];
    coll_CPU[jcoll].vAy=Vy[jp1-1];              
    coll_CPU[jcoll].vAz=Vz[jp1-1];              
    coll_CPU[jcoll].vBx=Vx[jp2-1];              
    coll_CPU[jcoll].vBy=Vy[jp2-1];              
    coll_CPU[jcoll].vBz=Vz[jp2-1];              
    coll_CPU[jcoll].tetaAp=tetap[jp1-1];                
    coll_CPU[jcoll].phiAp=phip[jp1-1];              
    coll_CPU[jcoll].tetaA=teta[jp1-1];              
    coll_CPU[jcoll].tetaBp=tetap[jp2-1];                
    coll_CPU[jcoll].phiBp=phip[jp2-1];              
    coll_CPU[jcoll].tetaB=teta[jp2-1];
    coll_CPU[jcoll].kAx=kAx;
    coll_CPU[jcoll].kAy=kAy;
    coll_CPU[jcoll].kAz=kAz;
    coll_CPU[jcoll].kBx=kBx;
    coll_CPU[jcoll].kBy=kBy;
    coll_CPU[jcoll].kBz=kBz;
    coll_CPU[jcoll].caso=1;

    jcoll++;
   }
  }

  clock_t t;
  t = clock();
  #pragma omp parallel for private(id)  //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
  for(id=0;id<Nmax;id++){   
     CollisioniCPU(coll_CPU,id);
  } 
  t = clock() - t;
  msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC; 
  printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);

  for (ind=0;ind<Ncoll;ind++){
     if (coll_CPU[ind].caso==4)
    Ncoll_eff++;
    else if (coll_CPU[ind].caso==0)
       Ncoll_div++;
    else 
       Ncoll_dim++;
    }
  free(coll_CPU);
  }
return 0;
}

DSMC_kernel_float.c

/*
 * Doubles, in place, twelve float members of the record coll_CPU[id]:
 * vAx, vAy, vAz, tetaAp, phiAp, tetaA and vBx, vBy, vBz, tetaBp, phiBp,
 * tetaB. Every other member of the record is left untouched.
 *
 * coll_CPU: array of collision records.
 * id:       index of the record to update; no bounds check is performed.
 */
void CollisioniCPU(stato_struct *coll_CPU, int id){
    stato_struct *rec = &coll_CPU[id];
    /* Addresses of the members to scale, A-particle fields first. */
    float *campi[] = {
        &rec->vAx, &rec->vAy, &rec->vAz, &rec->tetaAp, &rec->phiAp, &rec->tetaA,
        &rec->vBx, &rec->vBy, &rec->vBz, &rec->tetaBp, &rec->phiBp, &rec->tetaB
    };
    unsigned k;

    for (k = 0; k < sizeof campi / sizeof campi[0]; k++){
        *campi[k] = 2 * *campi[k];
    }
}

In order to compile the program I type this line in the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm, followed by export OMP_NUM_THREADS=1; however, once I run the executable I get this error message:

Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted

What does this error mean? What have I done wrong in the main function when I tried to parallelize the for loop? And most important: what should I type instead to make my code run in parallel? Please help me out if you can, because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.

1 Answer 1

1

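Changing the inner loop as follows should bring you one step further. The loop bound must be Ncoll rather than Nmax: coll_CPU holds only Ncoll elements, so iterating up to Nmax writes past the end of the allocation, which corrupts the heap and produces the "double free or corruption" abort when free() is called.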

#pragma omp parallel for private(id)
for(id=0;id<Ncoll;id++){   
  CollisioniCPU(coll_CPU,id);
}

Your OpenMP line seems okay, but I doubt that it will lead to significant improvements in runtime. You should optimize the surrounding code as well. Allocating the memory once outside of your loops would be a good start.

By the way, is there any reason for this verbose coding style, rather than a more compact and readable version like this one?

/* Doubles, in place, the vA*, vB*, teta* and phi* members of coll_CPU[id];
   the remaining members of the record are not modified. */
void CollisioniCPU(stato_struct *coll_CPU, int id) {
  stato_struct *const rec = &coll_CPU[id];

  /* members taken from particle A */
  rec->vAx    = 2 * rec->vAx;
  rec->vAy    = 2 * rec->vAy;
  rec->vAz    = 2 * rec->vAz;
  rec->tetaAp = 2 * rec->tetaAp;
  rec->phiAp  = 2 * rec->phiAp;
  rec->tetaA  = 2 * rec->tetaA;

  /* members taken from particle B */
  rec->vBx    = 2 * rec->vBx;
  rec->vBy    = 2 * rec->vBy;
  rec->vBz    = 2 * rec->vBz;
  rec->tetaBp = 2 * rec->tetaBp;
  rec->phiBp  = 2 * rec->phiBp;
  rec->tetaB  = 2 * rec->tetaB;
}
Sign up to request clarification or add additional context in comments.

1 Comment

Perfect! That was a dumb mistake on my part! As for the function, its body was just a random placeholder with no meaning; thanks anyway for your advice!

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.