I have optimized as much as I could my function for sequential running.
When I use openMP I see no gain in performance.
I tried my program on a machine with 1 cores and on a machine with 8 cores, and the performance is the same.
With year set to 20, I have
1 core: 1 sec.
8 core: 1 sec.
With year set to 25 I have
1 core: 40 sec.
8 core: 40 sec.
1 core machine: my laptop's intel core 2 duo 1.8 GHz, ubuntu linux
8 core machine: 3.25 GHz, ubuntu linux
My program enumerate all the possible path of a binomial tree and do some work on each path. So my loop size increase exponentially and I would expect the footprint of openMP thread to be zero. In my loop, I only do a reduction of one variable. All other variable are read-only. I only use function I wrote, and I think they are thread safe.
I also run Valgrind cachegrind on my program. I don't fully understand the output but there seems to be no cache miss or false sharing.
I compile with
gcc -O3 -g3 -Wall -c -fmessage-length=0 -lm -fopenmp -ffast-math
My complete program is as below. Sorry for posting a lot of code. I'm not familiar with openMP nor C, and I couldn't resume my code more without loosing the main task.
How can I improve performance when I use openMP?
Are they some compiler flags or C tricks that will make the program run faster?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
printf("starting\n");
int year=20;
int tradingdate0=1;
globalinit(year,tradingdate0);
int i;
float v=0;
long n=pow(tradingdate0+1,year);
#pragma omp parallel for reduction(+:v)
for(i=0;i<n;i++)
v+=pathvalue(i);
globaldel();
printf("finished\n");
return 0;
}
//***function on which openMP is applied
float pathvalue(long pathindex) {
float value = -ctx.firstpremium;
float personalaccount = ctx.personalaccountat0;
float account = ctx.firstpremium;
int i;
for (i = 0; i < ctx.year-1; i++) {
value *= ctx.accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,ctx.guarantee[i]);
value += qx(i) * death;
if (haswithdraw(i)){
double withdraw = personalaccount*ctx.allowed;
value += px(i) * withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
}
//last year
double index = getindex(ctx.year-1,pathindex);
account = account * index;
value+=fmaxf(account,ctx.guarantee[ctx.year-1]);
return value * ctx.discountfactor;
}
int haswithdraw(int period){
return 1;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/ctx.chunksize[period])%ctx.tradingdate;
return ctx.stock[ndx];
}
float qx(int period){
return 0;
}
float px(int period){
return 1;
}
//****global
struct context ctx;
void globalinit(int year, int tradingdate0){
ctx.year = year;
ctx.tradingdate0 = tradingdate0;
ctx.firstpremium = 1;
ctx.riskfreerate = 0.06;
ctx.volatility=0.25;
ctx.personalaccountat0 = 1;
ctx.allowed = 0.07;
ctx.guaranteerate = 0.03;
ctx.alpha=1;
ctx.beta = 1;
ctx.tradingdate=tradingdate0+1;
ctx.discountfactor = exp(-ctx.riskfreerate * ctx.year);
ctx.accumulationfactor = exp(ctx.riskfreerate);
ctx.guaranteefactor = 1+ctx.guaranteerate;
ctx.upmove=exp(ctx.volatility/sqrt(ctx.tradingdate0));
ctx.downmove=1/ctx.upmove;
ctx.stock=(float*)malloc(sizeof(float)*ctx.tradingdate);
int i;
for(i=0;i<ctx.tradingdate;i++)
ctx.stock[i]=pow(ctx.upmove,ctx.tradingdate0-i)*pow(ctx.downmove,i);
ctx.chunksize=(long*)malloc(sizeof(long)*ctx.year);
for(i=0;i<year;i++)
ctx.chunksize[i]=pow(ctx.tradingdate,ctx.year-i-1);
ctx.guarantee=(float*)malloc(sizeof(float)*ctx.year);
for(i=0;i<ctx.year;i++)
ctx.guarantee[i]=ctx.beta*pow(ctx.guaranteefactor,i+1);
}
void globaldel(){
free(ctx.stock);
free(ctx.chunksize);
free(ctx.guarantee);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//***global
struct context{
int year;
int tradingdate0;
float firstpremium;
float riskfreerate;
float volatility;
float personalaccountat0;
float allowed;
float guaranteerate;
float alpha;
float beta;
int tradingdate;
float discountfactor;
float accumulationfactor;
float guaranteefactor;
float upmove;
float downmove;
float* stock;
long* chunksize;
float* guarantee;
};
struct context ctx;
void globalinit();
void globaldel();
EDIT I simplify all global variables as constant. For 20 year, the program run two time faster (great!). I tried to set the number of thread with OMP_NUM_THREADS=4 ./test for example. But it didn't give me any performance gain.
Can my gcc have some problem?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
starttimer();
printf("starting\n");
int i;
float v=0;
#pragma omp parallel for reduction(+:v)
for(i=0;i<numberofpath;i++)
v+=pathvalue(i);
printf("v:%f\nfinished\n",v);
endtimer();
return 0;
}
//function on which openMP is applied
float pathvalue(long pathindex) {
float value = -firstpremium;
float personalaccount = personalaccountat0;
float account = firstpremium;
int i;
for (i = 0; i < year-1; i++) {
value *= accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,guarantee[i]);
value += death;
double withdraw = personalaccount*allowed;
value += withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
//last year
double index = getindex(year-1,pathindex);
account = account * index;
value+=fmaxf(account,guarantee[year-1]);
return value * discountfactor;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/chunksize[period])%tradingdate;
return stock[ndx];
}
//timing
clock_t begin;
void starttimer(){
begin = clock();
}
void endtimer(){
clock_t end = clock();
double elapsed = (double)(end - begin) / CLOCKS_PER_SEC;
printf("\nelapsed: %f\n",elapsed);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//timing
void starttimer();
void endtimer();
//***constant
const int year= 20 ;
const int tradingdate0= 1 ;
const float firstpremium= 1 ;
const float riskfreerate= 0.06 ;
const float volatility= 0.25 ;
const float personalaccountat0= 1 ;
const float allowed= 0.07 ;
const float guaranteerate= 0.03 ;
const float alpha= 1 ;
const float beta= 1 ;
const int tradingdate= 2 ;
const int numberofpath= 1048576 ;
const float discountfactor= 0.301194211912 ;
const float accumulationfactor= 1.06183654655 ;
const float guaranteefactor= 1.03 ;
const float upmove= 1.28402541669 ;
const float downmove= 0.778800783071 ;
const float stock[2]={1.2840254166877414, 0.7788007830714049};
const long chunksize[20]={524288, 262144, 131072, 65536, 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1};
const float guarantee[20]={1.03, 1.0609, 1.092727, 1.1255088100000001, 1.1592740743, 1.1940522965290001, 1.2298738654248702, 1.2667700813876164, 1.304773183829245, 1.3439163793441222, 1.384233870724446, 1.4257608868461793, 1.4685337134515648, 1.512589724855112, 1.557967416600765, 1.6047064390987882, 1.6528476322717518, 1.7024330612399046, 1.7535060530771016, 1.8061112346694148};
enumfor integers or #define for floating point) and pass all run time parameters as argument to your function. The way you are doing it the compiler cannot be sure that some other part of the program doesn't change particular values of thestruct, so it can't do constant propagation. Cleaning that up will also help the parallel compilation.-O3 -S) and compare the resulting code with and without-fopenmp.externdeclaration. Otherwise, can you sketch the right approach? My gcc was ok finally, I was not measuring openMP performance correctly as pointed by Hristo Iliev.extern) can't have an initialization. So some of your code wouldn't see the value and the optimization potential would be much less.