I wrote a C code that I would like to parallelize using OpenMP (I am a beginner and I have just a few days to solve this task); let's start from the main: first of all I have initialized 6 vectors (Vx,Vy,Vz,thetap,phip,theta); then there is a for loop that cycles over Nmax; inside of this loop I allocate some memory for the structure I have defined at the very top of the code; the structure is called coll_CPU and increases its size every cycle; then I pick some of the values from the vectors I have mentioned before and I place them into the structure; so at this point my structure coll_CPU is filled with Ncoll elements; during this process I used some of the functions declared outside of the main (these functions are random number generators). Now comes the important part: in my serial code I use a for loop to pass every single element of the structure to a function called collisionCPU (this function just gets the inputs and multiplies them by 2); My goal is to parallelize this loop so that each of my CPUs gives its contribution to do this operation and speed up the process.
Here are the codes:
main.c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define pi2 6.283185307
#define pi 3.141592654
#define IMUL(a,b) __mul24(a,b)
typedef struct {
int seme;
} iniran;
typedef struct{
int jp1;
int jp2;
float kx;
float ky;
float kz;
float vAx;
float vAy;
float vAz;
float vBx;
float vBy;
float vBz;
float tetaAp;
float phiAp;
float tetaA;
float tetaBp;
float phiBp;
float tetaB;
float kAx;
float kAy;
float kAz;
float kBx;
float kBy;
float kBz;
int caso;
} stato_struct;
stato_struct *coll_CPU=0;
unsigned int timer;
#include "DSMC_kernel_float.c"
//=============================================================
float min(float *a, float*b){
if(*a<*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float max(float *a, float*b){
if(*a>*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float rf(int *idum){
static int iff=0;
static int inext, inextp, ma[55];
int mj, mk;
int i, k, ii;
float ret_val;
if (*idum<0 || iff==0) {
iff=1;
mj=161803398 - abs(*idum);
mj %= 1000000000;
ma[54]=mj;
mk=1;
for (i=1; i<=54; ++i){
ii=(i*21)%55;
ma[ii-1]=mk;
mk=mj-mk;
if (mk<0) {
mk += 1000000000;
}
mj= ma[ii-1];
}
for(k=1; k<=4; ++k) {
for(i=1; i<=55; ++i){
ma[i-1] -= ma[(i+30)%55];
if (ma[i-1]<0){
ma[i-1] += 1000000000;
}
}
}
inext=0;
inextp=31;
*idum=1;
}
++inext;
if (inext==56){
inext=1;
}
++inextp;
if (inextp==56){
inextp=1;
}
mj=ma[inext-1]-ma[inextp-1];
if (mj<0){
mj += 1000000000;
}
ma[inext-1]=mj;
ret_val=mj*1.0000000000000001e-9;
return ret_val;
}
//============================================================
int genk(float *kx, float *ky, float *kz, int *p2seme){
// float sqrtf(float), sinf(float), cosf(float);
extern float rf(int *);
static float phi;
*kx=rf(p2seme) * 2. -1.f;
*ky= sqrtf(1. - *kx * *kx);
phi=pi2*rf(p2seme);
*kz=*ky * sinf(phi);
*ky *= cosf(phi);
return 0;
}
//==============================================================
int main(void){
float msec_kernel;
int Np=10000, Nmax=512;
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;
iniran1.seme=7593;
for(jp=1;jp<=Np;jp++){
if(jp<=Np/2){
Vx[jp-1]=2.5;
Vy[jp-1]=0;
Vz[jp-1]=0;
tetap[jp-1]=0;
phip[jp-1]=0;
teta[jp-1]=0;
}
for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){
coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
jcoll=0;
while (jcoll<Ncoll){
jp1=1+floorf(Np*rf(&iniran1.seme));
jp2=1+floorf(Np*rf(&iniran1.seme));
genk(&kx,&ky,&kz,&iniran1.seme);
Vrx=Vx[jp2-1]-Vx[jp1-1];
Vry=Vy[jp2-1]-Vy[jp1-1];
Vrz=Vz[jp2-1]-Vz[jp1-1];
scalprod=Vrx*kx+Vry*ky+Vrz*kz;
if (scalprod<0) {
genk(&kAx,&kAy,&kAz,&iniran1.seme);
genk(&kBx,&kBy,&kBz,&iniran1.seme);
coll_CPU[jcoll].jp1= jp1;
coll_CPU[jcoll].jp2=jp2;
coll_CPU[jcoll].kx=kx;
coll_CPU[jcoll].ky=ky;
coll_CPU[jcoll].kz=kz;
coll_CPU[jcoll].vAx=Vx[jp1-1];
coll_CPU[jcoll].vAy=Vy[jp1-1];
coll_CPU[jcoll].vAz=Vz[jp1-1];
coll_CPU[jcoll].vBx=Vx[jp2-1];
coll_CPU[jcoll].vBy=Vy[jp2-1];
coll_CPU[jcoll].vBz=Vz[jp2-1];
coll_CPU[jcoll].tetaAp=tetap[jp1-1];
coll_CPU[jcoll].phiAp=phip[jp1-1];
coll_CPU[jcoll].tetaA=teta[jp1-1];
coll_CPU[jcoll].tetaBp=tetap[jp2-1];
coll_CPU[jcoll].phiBp=phip[jp2-1];
coll_CPU[jcoll].tetaB=teta[jp2-1];
coll_CPU[jcoll].kAx=kAx;
coll_CPU[jcoll].kAy=kAy;
coll_CPU[jcoll].kAz=kAz;
coll_CPU[jcoll].kBx=kBx;
coll_CPU[jcoll].kBy=kBy;
coll_CPU[jcoll].kBz=kBz;
coll_CPU[jcoll].caso=1;
jcoll++;
}
}
clock_t t;
t = clock();
#pragma omp parallel for private(id) //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
for(id=0;id<Nmax;id++){
CollisioniCPU(coll_CPU,id);
}
t = clock() - t;
msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC;
printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);
for (ind=0;ind<Ncoll;ind++){
if (coll_CPU[ind].caso==4)
Ncoll_eff++;
else if (coll_CPU[ind].caso==0)
Ncoll_div++;
else
Ncoll_dim++;
}
free(coll_CPU);
}
return 0;
}
DSMC_kernel_float.c
void CollisioniCPU(stato_struct *coll_CPU, int id){
float vettA[6], vettB[6];
vettA[0]=coll_CPU[id].vAx;
vettA[1]=coll_CPU[id].vAy;
vettA[2]=coll_CPU[id].vAz;
vettA[3]=coll_CPU[id].tetaAp;
vettA[4]=coll_CPU[id].phiAp;
vettA[5]=coll_CPU[id].tetaA;
vettB[0]=coll_CPU[id].vBx;
vettB[1]=coll_CPU[id].vBy;
vettB[2]=coll_CPU[id].vBz;
vettB[3]=coll_CPU[id].tetaBp;
vettB[4]=coll_CPU[id].phiBp;
vettB[5]=coll_CPU[id].tetaB;
coll_CPU[id].vAx=2*vettA[0];
coll_CPU[id].vAy=2*vettA[1];
coll_CPU[id].vAz=2*vettA[2];
coll_CPU[id].tetaAp=2*vettA[3];
coll_CPU[id].phiAp=2*vettA[4];
coll_CPU[id].tetaA=2*vettA[5];
coll_CPU[id].vBx=2*vettB[0];
coll_CPU[id].vBy=2*vettB[1];
coll_CPU[id].vBz=2*vettB[2];
coll_CPU[id].tetaBp=2*vettB[3];
coll_CPU[id].phiBp=2*vettB[4];
coll_CPU[id].tetaB=2*vettB[5];
}
In order to compile the program I type this line on the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm fallowed by export OMP_NUM_THREADS=1; however once I run the executable I get this error message:
Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted
What does this error mean? what I have done wrong in the main function when I tried to parallelize the for loop? and most important: what should I type instead in order to make my code go on parallel? please help me out if you can because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.