3

Today I stumbled across a weird problem. Consider this simple program where I try to emulate MMX's PADDW instruction:

#include <cstdint>
#include <cstdio>

// Emulates a packed 16-bit addition (MMX PADDW) on 64-bit integers: a, b and
// c are each viewed as four 16-bit words, the words of a and b are added
// pairwise into c, and the words plus the full 64-bit result are printed.
//
// NOTE: a, b and c are uint64_t objects but are accessed through uint16_t*.
// That violates the strict aliasing rule, so the behaviour is undefined; an
// optimizing compiler may assume the stores through c_words never modify c.
int main()
{
    uint64_t a = 0;
    uint64_t b = 0x1234123412341234;

    uint64_t c = 0;
    // Reinterpret each 64-bit value as an array of four 16-bit words.
    uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
    uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
    uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);

    // Word-wise add; assigning back to uint16_t truncates each sum to 16 bits.
    for (size_t i = 0; i < 4; i ++)
        c_words[i] = a_words[i] + b_words[i];

    // Print the four 16-bit words of each operand and of the result.
    printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
    printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
    printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
    // Reads c directly as a uint64_t, i.e. not through the aliasing pointer.
    // NOTE(review): %llx expects unsigned long long, which is not the same
    // type as uint64_t on every platform; PRIx64 is the portable specifier.
    printf("%016llx\n", c);
    return 0;
}

Compiling this and running with g++ -std=c++11 test.cpp -o test && ./test results in the following:

0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
1234123412341234

However, if I enable -O2, it displays the wrong value (with -O1 it still works):

0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
0000000000000000

Why is that?


Other observations:

  1. If I unroll the loop, compiling with -O2 works (!!):

    #include <cstdint>
    #include <cstdio>
    
    // Unrolled variant of the 16-bit packed add: the same word-wise addition
    // of a and b into c, written as four explicit statements instead of a loop.
    //
    // NOTE: the uint64_t objects are still accessed through uint16_t*, so
    // this variant violates the strict aliasing rule as well; producing the
    // expected output at a given optimization level does not make it defined.
    int main()
    {
        uint64_t a = 0;
        uint64_t b = 0x1234123412341234;
    
        uint64_t c = 0;
        // Reinterpret each 64-bit value as an array of four 16-bit words.
        uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
        uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
        uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);
    
        // One statement per 16-bit word; each sum is truncated to 16 bits.
        c_words[0] = a_words[0] + b_words[0];
        c_words[1] = a_words[1] + b_words[1];
        c_words[2] = a_words[2] + b_words[2];
        c_words[3] = a_words[3] + b_words[3];
    
        // Print the four 16-bit words of each operand and of the result.
        printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
        printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
        printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
        // Reads c directly as a uint64_t, i.e. not through the aliasing pointer.
        printf("%016llx\n", c);
        return 0;
    }
    
  2. If I work with a very similar problem but for 32-bit integers instead of 64-bit ones, it works as well:

    #include <cstdint>
    #include <cstdio>
    
    // 32-bit variant of the experiment: a, b and c are uint32_t values viewed
    // as four 8-bit elements each, added element-wise in a loop and printed.
    //
    // NOTE(review): uint8_t is presumably an alias of unsigned char on the
    // tested platforms; character types are permitted to alias any object,
    // which would explain why this variant is unaffected -- confirm for the
    // target toolchain.
    int main()
    {
        uint32_t a = 0;
        uint32_t b = 0x12121212;
    
        uint32_t c = 0;
        // Despite the "_words" names, these pointers address individual bytes.
        uint8_t *a_words = reinterpret_cast<uint8_t*>(&a);
        uint8_t *b_words = reinterpret_cast<uint8_t*>(&b);
        uint8_t *c_words = reinterpret_cast<uint8_t*>(&c);
    
        // Byte-wise add; assigning back to uint8_t truncates each sum to 8 bits.
        for (size_t i = 0; i < 4; i ++)
            c_words[i] = a_words[i] + b_words[i];
    
        // Print the four bytes of each operand and of the result.
        printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
        printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
        printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
        // Reads c directly as a uint32_t and prints it as 8 hex digits.
        printf("%08x\n", c);
        return 0;
    }
    

The problem recurs on both 32-bit and 64-bit machines. Tried g++ (GCC) 4.9.2 on Cygwin and g++ (Debian 4.9.1-19) 4.9.1 on GNU/Linux.

6
  • 4
    You violate strict aliasing, which results in undefined behaviour, which your compiler exploits. Commented Feb 15, 2015 at 13:48
  • 2
    Have you tried fno-strict-aliasing? Commented Feb 15, 2015 at 13:49
  • 1
    stackoverflow.com/questions/2958633/… Commented Feb 15, 2015 at 13:49
  • It works with -fno-strict-aliasing, thank you! However, judging from the tone of your comments I feel I'm doing the whole thing totally wrong. Mind giving me a clue how I could tackle the problem in a more elegant way? Commented Feb 15, 2015 at 13:51
  • 1
    In this case, you could use the good old SWAR addition: ((a & 0x7FFF7FFF7FFF7FFF) + (b & 0x7FFF7FFF7FFF7FFF)) ^ ((a ^ b) & 0x8000800080008000) Commented Feb 15, 2015 at 13:55

1 Answer 1

4

This is a strict aliasing violation. You write values of type A to memory which stores an object of type B. The C++ standard says you can't do that (the exceptions to this rule are char and its unsigned and signed variants).

This is non-portable code, but if you still want to do it legally, what can you do about it?

  • copy from uint64_t to uint16_t array (by memcpy or std::copy), modify the values, copy it back.
  • OR use compiler intrinsics which translate directly to vectorized instructions
  • OR disable strict aliasing.
Sign up to request clarification or add additional context in comments.

2 Comments

@rr- C++ says no, C says yes (as far as I remember)

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.