I've gotten stuck writing some parallel C code using OpenMP for a concurrency course.
Here's a snippet:
#include <stdio.h>
#include <stdlib.h>   /* atoi */
#include <time.h>
#include <sys/time.h> /* struct timeval */
#include <math.h>
#include <omp.h>

#define FALSE 0
#define TRUE 1

int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);
void time_it(int (*f)(int), int n, char *string);
int main(int argc, char *argv[]){
    int n;
    if (argc != 2){
        printf("Incorrect Invocation, use: \nq1 N");
        return 0;
    } else {
        n = atoi(argv[1]);
    }
    if (n < 0){
        printf("N cannot be negative");
        return 0;
    }
    printf("N = %d\n", n);
    //omp_set_num_threads(1);
    time_it(count_primes_0, n, "Method 0");
    time_it(count_primes_1, n, "Method 1");
    time_it(count_primes_2, n, "Method 2");
    return 0;
}
int is_prime(int n){
    for(int i = 2; i <= (int)(sqrt((double) n)); i++){
        if ((n % i) == 0){
            return FALSE;
        }
    }
    return n > 1;
}
void time_it(int (*f)(int), int n, char *string){
    clock_t start_clock;
    clock_t end_clock;
    double calc_time;
    int nprimes;
    struct timeval start_val;   /* currently unused */
    struct timeval end_val;     /* currently unused */

    start_clock = clock();
    nprimes = (*f)(n);
    end_clock = clock();
    calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
    /* print the label so the output matches the results below */
    printf("%s\n\tNumber of primes: %d \t Time taken: %fs\n\n", string, nprimes, calc_time);
}
// METHOD 0
// Base case: no parallelization
int count_primes_0(int n){
    int nprimes = 0;
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }
    return nprimes;
}
// METHOD 1
// Use only the for and critical constructs
int count_primes_1(int n){
    int nprimes = 0;
    #pragma omp parallel for
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            #pragma omp critical
            nprimes++;
        }
    }
    return nprimes;
}
// METHOD 2
// Use reduction
int count_primes_2(int n){
    int nprimes = 0;
    #pragma omp parallel for reduction(+:nprimes)
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }
    return nprimes;
}
The problem I'm facing is that when I use omp_set_num_threads(), the fewer threads I use, the faster my functions run, and the closer they get to the runtime of the unparallelized base case.
Time results (run on an 8-core machine):
8 threads: Method 0: 0.07s; Method 1: 1.63s; Method 2: 1.40s
4 threads: Method 0: 0.07s; Method 1: 0.16s; Method 2: 0.16s
2 threads: Method 0: 0.07s; Method 1: 0.10s; Method 2: 0.09s
1 thread:  Method 0: 0.07s; Method 1: 0.08s; Method 2: 0.07s
I've tried disabling optimization and using a different gcc version, with no difference.
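For reference, the compile line I use is roughly the following (q1.c is a placeholder file name and the N value is just an example; nothing beyond -fopenmp should matter here):

gcc -O2 -fopenmp q1.c -o q1 -lm
./q1 100000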
Any help is appreciated.
EDIT: clock() on Linux measures CPU time summed across all threads, not wall-clock time. Wall-clock time is what I needed, so using either omp_get_wtime() or gettimeofday() produces the expected results.
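For anyone hitting the same thing, here is a minimal sketch of the timing wrapper rewritten around omp_get_wtime() (same signature as my time_it above, just a different clock source; needs <omp.h> and -fopenmp):

void time_it(int (*f)(int), int n, char *string){
    double start = omp_get_wtime();   /* wall-clock seconds */
    int nprimes = (*f)(n);
    double end = omp_get_wtime();
    printf("%s\n\tNumber of primes: %d \t Time taken: %fs\n\n",
           string, nprimes, end - start);
}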