4

I am trying to use the Intel C compiler for the first time and am getting completely the wrong answers. What am I doing wrong?

I have some code as follows:

#include <stdint.h>
#include <stdio.h>

/* Rows folded per unrolled chunk in the inner product loop. */
#define CHUNK_SIZE 12
/* OpenMP thread count requested for the parallel reduction. */
#define NUM_THREADS 8

/* Population count of a 64-bit value (GCC/ICC builtin). */
#define popcnt __builtin_popcountll
/* 10^9: the 128-bit result is printed in 9-decimal-digit groups. */
#define BILLION (1000 * 1000 * 1000)
/* Shorthand used in the mask loop: folds row `row` into row_pprod[]
 * and advances `row` (relies on these names being in scope). */
#define UPDATE_ROW_PPROD() \
    update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)

/* 128-bit signed accumulator (GCC/ICC extension type). */
typedef __int128 int128_t;

/*
 * Fold row `row`'s contribution into the four running partial products.
 *
 * dot = 2*popcount(rows[row] & mask) - mask_popcnt is the signed dot
 * product of the row's +/-1 entries (selected by `mask`) with the row.
 * The four slots of row_pprod accumulate, respectively:
 *   [0] dot
 *   [1] dot - 1
 *   [2] dot - 1 - row_sums[row]
 *   [3] dot - row_sums[row]
 *
 * Returns row + 1 so the caller can write `row = UPDATE_ROW_PPROD();`.
 */
static inline int64_t update_row_pprod
(
    int64_t* row_pprod, int64_t row, int64_t* rows,
    int64_t* row_sums, int64_t mask, int64_t mask_popcnt
)
{
    int64_t dot = 2 * __builtin_popcountll(rows[row] & mask) - mask_popcnt;
    int64_t sum = row_sums[row];

    row_pprod[0] *= dot;
    row_pprod[1] *= dot - 1;
    row_pprod[2] *= dot - 1 - sum;
    row_pprod[3] *= dot - sum;

    return row + 1;
}

/*
 * Compute the permanent of an n x n +/-1 matrix given on the command
 * line, one argument per row ("-+" encodes the row (-1, +1)), and
 * print it in decimal.  Uses an expansion over column-sign masks
 * (Glynn/Ryser style -- TODO confirm exact variant), parallelized
 * with an OpenMP reduction into a 128-bit accumulator.
 */
int main(int argc, char* argv[])
{
    /* NOTE(review): argc == 1 makes these zero-length VLAs (UB). */
    int64_t size = argc - 1, rows[argc - 1];
    int64_t row_sums[argc - 1];
    int128_t permanent = 0, sign = size & 1 ? -1 : 1;

    /* 1x1 matrix: the permanent is just the single entry, +1 or -1. */
    if (argc == 2)
    {
        printf("%d\n", argv[1][0] == '-' ? -1 : 1);
        return 0;
    }

    /* Parse each row argument into a bitmask and a signed row sum. */
    for (int64_t row = 0; row < size; row++)
    {
        /* `positive` is the OPPOSITE sign character of the row's
         * first entry (used below to flip the overall sign). */
        char positive = argv[row + 1][0] == '+' ? '-' : '+';

        /* ',' (ASCII 44) sits between '+' (43) and '-' (45), so this
         * multiplies `sign` by +1 or -1 depending on `positive`. */
        sign *= ',' - positive;
        rows[row] = row_sums[row] = 0;

        /* Remaining characters: set a bit where the entry equals
         * `positive`, counting the matches as we go. */
        for (char* p = &argv[row + 1][1]; *p; p++)
        {
            rows[row] <<= 1;
            rows[row] |= *p == positive;
            row_sums[row] += *p == positive;
        }

        /* Convert match count to (#matches) - (#non-matches). */
        row_sums[row] = 2 * row_sums[row] - size;
    }

    /* Iterate only odd masks of size-1 bits; each iteration folds in
     * four related sign patterns (the *_high / *_inv variants),
     * presumably exploiting symmetry of the expansion to quarter the
     * mask space -- TODO confirm.  NOTE(review): reduction(+) on an
     * __int128 variable is the construct reported to miscompile under
     * icc 17.0.1 with -qopenmp. */
    #pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
    for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
    {
        int64_t mask_popcnt = popcnt(mask);
        int64_t row = 0;
        /* Start at +/-1 according to the parity of the mask. */
        int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
        int128_t row_prod_high = -row_prod;
        int128_t row_prod_inv = row_prod;
        int128_t row_prod_inv_high = -row_prod;

        /* Accumulate CHUNK_SIZE rows at a time in 64-bit partial
         * products before widening into the 128-bit accumulators. */
        for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
        {
            int64_t row_pprod[4] = {1, 1, 1, 1};

            for (int64_t i = 0; i < CHUNK_SIZE; i++)
                row = UPDATE_ROW_PPROD();

            row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
            row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        }

        /* Tail: remaining size % CHUNK_SIZE rows. */
        int64_t row_pprod[4] = {1, 1, 1, 1};

        while (row < size)
            row = UPDATE_ROW_PPROD();

        row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
        row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
    }

    permanent *= sign;

    /* Print the 128-bit result in decimal: sign first, ... */
    if (permanent < 0)
        printf("-"), permanent *= -1;

    /* ... then split the magnitude into five 9-digit base-10^9 limbs
     * (5 * 9 = 45 digits covers the ~39-digit __int128 range). */
    int32_t output[5], print = 0;

    output[0] = permanent % BILLION, permanent /= BILLION;
    output[1] = permanent % BILLION, permanent /= BILLION;
    output[2] = permanent % BILLION, permanent /= BILLION;
    output[3] = permanent % BILLION, permanent /= BILLION;
    output[4] = permanent % BILLION;

    /* Highest nonzero limb prints unpadded; every limb after it is
     * zero-padded to 9 digits.  NOTE(review): output[] is int32_t but
     * printed with %u/%09u -- values are non-negative here, but
     * %d/%09d (or PRId32) would match the type. */
    if (output[4])
        printf("%u", output[4]), print = 1;
    if (print)
        printf("%09u", output[3]);
    else if (output[3])
        printf("%u", output[3]), print = 1;
    if (print)
        printf("%09u", output[2]);
    else if (output[2])
        printf("%u", output[2]), print = 1;
    if (print)
        printf("%09u", output[1]);
    else if (output[1])
        printf("%u", output[1]), print = 1;
    if (print)
        printf("%09u\n", output[0]);
    else
        printf("%u\n", output[0]);
}

If I compile it with

gcc -Wall -std=c99 -fopenmp  -o permanent permanent.c

I can then run it with

permanent -+ -+

and get the output

-2

which is correct.

If I instead compile using the Intel C compiler (17.0.1) with

icc -std=c99 -qopenmp -Wall permanent.c

and then do

a.out -+ -+

I get

11910984139051480114196905982

As noted in the comments, if you remove the -qopenmp then icc produces a version that runs correctly, albeit only on one core.

8
  • @squeamishossifrage I did include a sample input and the correct output. The code computes the permanent of a +-1 matrix. I am not sure how to get useful info from a debugger for this problem. Any help much appreciated. Maybe I just have the wrong compile flags? Commented Dec 7, 2016 at 11:37
  • @eleanora do you get the correct result if you remove -qopenmp? Commented Dec 7, 2016 at 11:40
  • @davmac Yes! That's a great spot. (It of course now warns "warning #3180: unrecognized OpenMP #pragma" and only runs on one core.) Commented Dec 7, 2016 at 11:41
  • 1
    @eleanora I don't know much about OpenMP, so I can only suggest then that it's either a bug in Intel's implementation or that your code breaks some restriction that doesn't matter in GCC's implementation. Maybe someone else here on SO will know better. Commented Dec 7, 2016 at 11:48
  • 1
    @squeamishossifrage Sure, there are things to improve, but the original question already contained a complete and verifiable example with compilation instruction, compiler versions, expected vs actual output. That easily makes it into the top10% questions on [openmp]. Commented Dec 7, 2016 at 16:26

2 Answers 2

7

Looks like a bug in Intel's handling of OpenMP reduction on __int128 type variables. This can be easily reproduced:

#include <stdio.h>
#include <inttypes.h>

int main() {
    __int128 sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < 0; i++) {
        sum += 1;
    }
    printf("%" PRIX64 " %" PRIX64 "\n", (uint64_t)sum, (uint64_t)(sum >> 64));
}

Output with -fopenmp:

14000000000 78778300

Correct output without, or with gcc -fopenmp:

0 0

You can work around that (and keep the __int128 type) by using a trivial custom reduction declaration:

#pragma omp declare reduction(add128: int128_t: omp_out = omp_out + omp_in) initializer(omp_priv = 0)
[...]
#pragma omp parallel for reduction(add128:permanent) num_threads(NUM_THREADS)

As far as I can tell the standard does not restrict the type of a reduction list element. At least the compiler should tell you if it's not supported.

Sign up to request clarification or add additional context in comments.

2 Comments

Out of interest, is this a known bug? For example, should I report it?
Please report it and keep the question updated. I can't tell whether it is known or not. At least it seems like a trivial fix for Intel.
5

Thanks for the test case — this is definitely a compiler bug in the initialization of the private copy of the reduction variable. You are free to report the bug as well, though we will try to file an internal bug with the appropriate compiler developers.

So, another (not-very-elegant) workaround could be to explicitly initialize the private copy; that is, replace

#pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)

with

#pragma omp parallel reduction(+:permanent) num_threads(NUM_THREADS)
{
  permanent = 0; // that should be done by compiler actually
  #pragma omp for
  ...
}

1 Comment

If you have a way to file an internal bug report that would be great.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.