Return to Revisions

5 of 8

added 3 characters in body

edited Sep 6, 2018 at 13:41

9.3k
5
34
53

Instead of speculating about what may or may not happen, let's just look, shall we? I'll have to use C++ since I don't have a C# compiler handy (though see the C# example from VisualMelon), but I'm sure the same principles apply regardless.

We'll include the two alternatives you encountered in the interview. We'll also include a version that uses abs as suggested by some of the answers.

#include <cstdlib>

bool IsSumInRangeWithVar(int a, int b)
{
    int s = a + b;

    if (s > 1000 || s < -1000) return false;
    else return true;
}

bool IsSumInRangeWithoutVar(int a, int b)
{
    if (a + b > 1000 || a + b < -1000) return false;
    else return true;
}

bool IsSumInRangeSuperOptimized(int a, int b) {
    return (abs(a + b) < 1000);
}

Now compile it with no optimization whatsoever: g++ -c -o test.o test.cpp

Now we can see precisely what this generates: objdump -d test.o

0000000000000000 <_Z19IsSumInRangeWithVarii>:
   0:   55                      push   %rbp              # begin a call frame
   1:   48 89 e5                mov    %rsp,%rbp
   4:   89 7d ec                mov    %edi,-0x14(%rbp)  # save first argument (a) on stack
   7:   89 75 e8                mov    %esi,-0x18(%rbp)  # save b on stack
   a:   8b 55 ec                mov    -0x14(%rbp),%edx  # load a and b into edx
   d:   8b 45 e8                mov    -0x18(%rbp),%eax  # load b into eax
  10:   01 d0                   add    %edx,%eax         # add a and b
  12:   89 45 fc                mov    %eax,-0x4(%rbp)   # save result as s on stack
  15:   81 7d fc e8 03 00 00    cmpl   $0x3e8,-0x4(%rbp) # compare s to 1000
  1c:   7f 09                   jg     27                # jump to 27 if it's greater
  1e:   81 7d fc 18 fc ff ff    cmpl   $0xfffffc18,-0x4(%rbp) # compare s to -1000
  25:   7d 07                   jge    2e                # jump to 2e if it's greater or equal
  27:   b8 00 00 00 00          mov    $0x0,%eax         # put 0 (false) in eax, which will be the return value
  2c:   eb 05                   jmp    33 <_Z19IsSumInRangeWithVarii+0x33>
  2e:   b8 01 00 00 00          mov    $0x1,%eax         # put 1 (true) in eax
  33:   5d                      pop    %rbp
  34:   c3                      retq

0000000000000035 <_Z22IsSumInRangeWithoutVarii>:
  35:   55                      push   %rbp
  36:   48 89 e5                mov    %rsp,%rbp
  39:   89 7d fc                mov    %edi,-0x4(%rbp)
  3c:   89 75 f8                mov    %esi,-0x8(%rbp)
  3f:   8b 55 fc                mov    -0x4(%rbp),%edx
  42:   8b 45 f8                mov    -0x8(%rbp),%eax  # same as before
  45:   01 d0                   add    %edx,%eax
  # note: unlike other implementation, result is not saved
  47:   3d e8 03 00 00          cmp    $0x3e8,%eax      # compare to 1000
  4c:   7f 0f                   jg     5d <_Z22IsSumInRangeWithoutVarii+0x28>
  4e:   8b 55 fc                mov    -0x4(%rbp),%edx  # since s wasn't saved, load a and b from the stack again
  51:   8b 45 f8                mov    -0x8(%rbp),%eax
  54:   01 d0                   add    %edx,%eax
  56:   3d 18 fc ff ff          cmp    $0xfffffc18,%eax # compare to -1000
  5b:   7d 07                   jge    64 <_Z22IsSumInRangeWithoutVarii+0x2f>
  5d:   b8 00 00 00 00          mov    $0x0,%eax
  62:   eb 05                   jmp    69 <_Z22IsSumInRangeWithoutVarii+0x34>
  64:   b8 01 00 00 00          mov    $0x1,%eax
  69:   5d                      pop    %rbp
  6a:   c3                      retq

000000000000006b <_Z26IsSumInRangeSuperOptimizedii>:
  6b:   55                      push   %rbp
  6c:   48 89 e5                mov    %rsp,%rbp
  6f:   89 7d fc                mov    %edi,-0x4(%rbp)
  72:   89 75 f8                mov    %esi,-0x8(%rbp)
  75:   8b 55 fc                mov    -0x4(%rbp),%edx
  78:   8b 45 f8                mov    -0x8(%rbp),%eax
  7b:   01 d0                   add    %edx,%eax
  7d:   3d 18 fc ff ff          cmp    $0xfffffc18,%eax
  82:   7c 16                   jl     9a <_Z26IsSumInRangeSuperOptimizedii+0x2f>
  84:   8b 55 fc                mov    -0x4(%rbp),%edx
  87:   8b 45 f8                mov    -0x8(%rbp),%eax
  8a:   01 d0                   add    %edx,%eax
  8c:   3d e8 03 00 00          cmp    $0x3e8,%eax
  91:   7f 07                   jg     9a <_Z26IsSumInRangeSuperOptimizedii+0x2f>
  93:   b8 01 00 00 00          mov    $0x1,%eax
  98:   eb 05                   jmp    9f <_Z26IsSumInRangeSuperOptimizedii+0x34>
  9a:   b8 00 00 00 00          mov    $0x0,%eax
  9f:   5d                      pop    %rbp
  a0:   c3                      retq

We can see from the stack addresses (for example, the -0x4 in mov %edi,-0x4(%rbp) versus the -0x14 in mov %edi,-0x14(%rbp)) that IsSumInRangeWithVar() uses 16 extra bytes on the stack.

Because IsSumInRangeWithoutVar() allocates no space on the stack to store the intermediate value s it has to recalculate it, resulting in this implementation being 2 instructions longer.

Funny, IsSumInRangeSuperOptimized() looks a lot like IsSumInRangeWithoutVar(), except it compares to -1000 first, and 1000 second.

Now let's compile with only the most basic optimizations: g++ -O1 -c -o test.o test.cpp. The result:

0000000000000000 <_Z19IsSumInRangeWithVarii>:
   0:   8d 84 37 e8 03 00 00    lea    0x3e8(%rdi,%rsi,1),%eax
   7:   3d d0 07 00 00          cmp    $0x7d0,%eax
   c:   0f 96 c0                setbe  %al
   f:   c3                      retq

0000000000000010 <_Z22IsSumInRangeWithoutVarii>:
  10:   8d 84 37 e8 03 00 00    lea    0x3e8(%rdi,%rsi,1),%eax
  17:   3d d0 07 00 00          cmp    $0x7d0,%eax
  1c:   0f 96 c0                setbe  %al
  1f:   c3                      retq

0000000000000020 <_Z26IsSumInRangeSuperOptimizedii>:
  20:   8d 84 37 e8 03 00 00    lea    0x3e8(%rdi,%rsi,1),%eax
  27:   3d d0 07 00 00          cmp    $0x7d0,%eax
  2c:   0f 96 c0                setbe  %al
  2f:   c3                      retq

Would you look at that: each variant is identical. The compiler is able to do something quite clever: abs(a + b) <= 1000 is equivalent to a + b + 1000 <= 2000 considering setbe does an unsigned comparison, so a negative number becomes a very large positive number. The lea instruction can actually perform all these additions in one instruction, and eliminate all the conditional branches.

To answer your question, almost always the thing to optimize for is not memory or speed, but readability. Reading code is a lot harder than writing it, and reading code that's been mangled to "optimize" it is a lot harder than reading code that's been written to be clear. More often than not, these "optimizations" have negligible, or as in this case exactly zero actual impact on performance.

answered Sep 6, 2018 at 12:37

Phil Frost

1.6k
2
11
12