I was looking at this benchmark of a custom std::function implementation: https://github.com/PacktPublishing/Hands-On-Design-Patterns-with-CPP-Second-Edition/blob/main/Chapter06/09_function.C
I tried to replicate the example and noticed that, even though I declared this simple function as: __attribute__((noinline)) auto function_no_inline(int a, int b, int c, int d) -> int { return a + b + c + d; }, calling it took the same time as calling the inline function, whereas the call took much longer when the function was actually defined in a different compilation unit. It seems that the attribute was ignored for some reason. Why? The arguments are obtained from rand().
Benchmark Time CPU Iterations
-----------------------------------------------------------------------
BM_invoke_function 1.35 ns 1.35 ns 504544141
BM_invoke_function_no_inline 0.271 ns 0.271 ns 2584830443
BM_invoke_function_inline 0.270 ns 0.270 ns 2580073503
BM_invoke_std_function 2.21 ns 2.17 ns 324669753
This is my code. It links against the Google Benchmark library:
#include <benchmark/benchmark.h>
#include <functional>
// Declaration only: no definition of `function` appears in this file; it is
// meant to be supplied by a different compilation unit, so calls to it from
// here cannot see its body.
auto function(int a, int b, int c, int d) -> int;
/// Returns a + b + c + d through a call that is meant to stay out of line.
///
/// `noinline` on its own only stops the inliner. GCC documents that calls to a
/// function without side effects may still be optimized away by other passes,
/// and recommends placing `asm("");` in the called function as an artificial
/// side effect so that every call is really emitted.
__attribute__((noinline)) auto function_no_inline(int a, int b, int c, int d) -> int
{
    // Empty asm statement: generates no instructions, but counts as a side
    // effect so the call cannot be dropped or treated as a pure computation.
    asm("");
    return a + b + c + d;
}
/// Sums the four integer arguments. Declared `inline`, with the return type
/// deduced from the returned expression (int).
inline auto function_inline(int a, int b, int c, int d)
{
    const int sum = a + b + c + d;
    return sum;
}
/// Calls `callable` with the four integer operands and returns its result.
///
/// The callable is taken by const reference, and the return type is deduced
/// (plain `auto`) from the call expression.
template <typename Fn>
auto invoke(int a, int b, int c, int d, const Fn& callable)
{
    return callable(a, b, c, d);
}
// Benchmarks
/// Benchmarks calling `function` (only declared in this file) through the
/// `invoke` template.
void BM_invoke_function(benchmark::State& state)
{
    // The four operands are taken from rand() once, before the benchmark loop.
    const int a = rand();
    const int b = rand();
    const int c = rand();
    const int d = rand();

    for (auto _ : state)
    {
        // The call result is handed to DoNotOptimize, followed by a memory
        // clobber on every iteration.
        benchmark::DoNotOptimize(invoke(a, b, c, d, function));
        benchmark::ClobberMemory();
    }
}
/// Benchmarks calling `function_no_inline` (defined in this file and marked
/// `noinline`) through the `invoke` template.
void BM_invoke_function_no_inline(benchmark::State& state)
{
    // The four operands are taken from rand() once, before the benchmark loop.
    const int a = rand();
    const int b = rand();
    const int c = rand();
    const int d = rand();

    for (auto _ : state)
    {
        // The call result is handed to DoNotOptimize, followed by a memory
        // clobber on every iteration.
        benchmark::DoNotOptimize(invoke(a, b, c, d, function_no_inline));
        benchmark::ClobberMemory();
    }
}
/// Benchmarks calling `function_inline` (defined in this file and declared
/// `inline`) through the `invoke` template.
void BM_invoke_function_inline(benchmark::State& state)
{
    // The four operands are taken from rand() once, before the benchmark loop.
    const int a = rand();
    const int b = rand();
    const int c = rand();
    const int d = rand();

    for (auto _ : state)
    {
        // The call result is handed to DoNotOptimize, followed by a memory
        // clobber on every iteration.
        benchmark::DoNotOptimize(invoke(a, b, c, d, function_inline));
        benchmark::ClobberMemory();
    }
}
/// Benchmarks calling `function` through a `std::function` wrapper that is
/// passed to the `invoke` template.
void BM_invoke_std_function(benchmark::State& state)
{
    // The four operands are taken from rand() once, before the benchmark loop.
    const int a = rand();
    const int b = rand();
    const int c = rand();
    const int d = rand();

    // The wrapper is built once, outside the loop, around `function`.
    using Signature = int(int, int, int, int);
    std::function<Signature> std_function{function};

    for (auto _ : state)
    {
        // The call result is handed to DoNotOptimize, followed by a memory
        // clobber on every iteration.
        benchmark::DoNotOptimize(invoke(a, b, c, d, std_function));
        benchmark::ClobberMemory();
    }
}
// Register the four benchmarks with Google Benchmark, in the same order as the
// rows of the results table.
BENCHMARK(BM_invoke_function);
BENCHMARK(BM_invoke_function_no_inline);
BENCHMARK(BM_invoke_function_inline);
BENCHMARK(BM_invoke_std_function);
// Google Benchmark macro that provides the program's main().
BENCHMARK_MAIN();
Update: adding an asm("") statement to the function body, in addition to __attribute__((noinline)), finally stopped the call from being optimized away as if it were inlined, hopefully without any other effects.