Consider the following C code (assuming 80-bit long double) (note, I do know of memcmp, this is just an experiment):
enum { sizeOfFloat80=10 }; // NOTE: sizeof(long double) != sizeOfFloat80
_Bool sameBits1(long double x, long double y)
{
for(int i=0;i<sizeOfFloat80;++i)
if(((char*)&x)[i]!=((char*)&y)[i])
return 0;
return 1;
}
All compilers I checked (gcc, clang, icc on gcc.godbolt.org) generate similar code, here's an example for gcc with options -O3 -std=c11 -fomit-frame-pointer -m32:
sameBits1:
movzx eax, BYTE PTR [esp+16]
cmp BYTE PTR [esp+4], al
jne .L11
movzx eax, BYTE PTR [esp+17]
cmp BYTE PTR [esp+5], al
jne .L11
movzx eax, BYTE PTR [esp+18]
cmp BYTE PTR [esp+6], al
jne .L11
movzx eax, BYTE PTR [esp+19]
cmp BYTE PTR [esp+7], al
jne .L11
movzx eax, BYTE PTR [esp+20]
cmp BYTE PTR [esp+8], al
jne .L11
movzx eax, BYTE PTR [esp+21]
cmp BYTE PTR [esp+9], al
jne .L11
movzx eax, BYTE PTR [esp+22]
cmp BYTE PTR [esp+10], al
jne .L11
movzx eax, BYTE PTR [esp+23]
cmp BYTE PTR [esp+11], al
jne .L11
movzx eax, BYTE PTR [esp+24]
cmp BYTE PTR [esp+12], al
jne .L11
movzx eax, BYTE PTR [esp+25]
cmp BYTE PTR [esp+13], al
sete al
ret
.L11:
xor eax, eax
ret
This looks ugly, has branch on every byte and in fact doesn't seem to have been optimized at all (but at least the loop is unrolled). It's easy to see though that this could be optimized to the code equivalent to the following (and in general for larger data to use larger strides):
#include <string.h>
_Bool sameBits2(long double x, long double y)
{
long long X=0; memcpy(&X,&x,sizeof x);
long long Y=0; memcpy(&Y,&y,sizeof y);
short Xhi=0; memcpy(&Xhi,sizeof x+(char*)&x,sizeof Xhi);
short Yhi=0; memcpy(&Yhi,sizeof y+(char*)&y,sizeof Yhi);
return X==Y && Xhi==Yhi;
}
And this code now gets much nicer compilation result:
sameBits2:
sub esp, 20
mov edx, DWORD PTR [esp+36]
mov eax, DWORD PTR [esp+40]
xor edx, DWORD PTR [esp+24]
xor eax, DWORD PTR [esp+28]
or edx, eax
movzx eax, WORD PTR [esp+48]
sete dl
cmp WORD PTR [esp+36], ax
sete al
add esp, 20
and eax, edx
ret
So my question is: why is none of the three compilers able to do this optimization? It it something very uncommon to see in the C code?
long doubleis not required to have 10 bytes. What do you want to accomplish with that code? It looks obfuscated and like a solution in search of a problem.