I have a bounded queue whose capacity is small and definitely fits in an int, so I want to use atomic<int> instead of atomic<size_t> for the indices/counters; since int is smaller, I expect it to be faster.
However, my current benchmark shows they have similar speed when used as counters (with std::memory_order_relaxed), but I'm not sure whether that is due to bad benchmarking (and this runs in a VM, so the current results aren't the most reliable).
=== System Information ===
OS: Linux 6.8.0-1043-gcp (#46~22.04.1-Ubuntu SMP Wed Oct 22 19:00:03 UTC 2025)
CPU:
Architecture: x86_64
Logical CPUs: 8
Node name: Check
Machine: x86_64
g++ 13.3
atomic<int> vs atomic<size_t> micro-benchmark
OPS_PER_BENCH = 50000000
int shared relaxed threads=1 ops=50000000 time=0.3271s ops/s=152878203.38 ns/op=6.54
size_t shared relaxed threads=1 ops=50000000 time=0.3222s ops/s=155182395.55 ns/op=6.44
int shared seq_cst threads=1 ops=50000000 time=0.3209s ops/s=155793958.46 ns/op=6.42
size_t shared seq_cst threads=1 ops=50000000 time=0.3210s ops/s=155787302.95 ns/op=6.42
int shared CAS threads=1 ops=50000000 time=0.5689s ops/s=87883570.52 ns/op=11.38
size_t shared CAS threads=1 ops=50000000 time=0.5593s ops/s=89389628.93 ns/op=11.19
int per-thread relaxed threads=1 ops=50000000 time=0.3255s ops/s=153615305.49 ns/op=6.51
size_t per-thread relaxed threads=1 ops=50000000 time=0.3205s ops/s=155990264.51 ns/op=6.41
------------------------------------------------------------
int shared relaxed threads=2 ops=50000000 time=1.5077s ops/s=33162809.24 ns/op=30.15
size_t shared relaxed threads=2 ops=50000000 time=1.4249s ops/s=35091189.36 ns/op=28.50
int shared seq_cst threads=2 ops=50000000 time=1.5050s ops/s=33222696.53 ns/op=30.10
size_t shared seq_cst threads=2 ops=50000000 time=1.8041s ops/s=27714312.85 ns/op=36.08
int shared CAS threads=2 ops=50000000 time=2.7661s ops/s=18075669.44 ns/op=55.32
size_t shared CAS threads=2 ops=50000000 time=3.2245s ops/s=15506267.21 ns/op=64.49
int per-thread relaxed threads=2 ops=50000000 time=1.3883s ops/s=36016218.37 ns/op=27.77
size_t per-thread relaxed threads=2 ops=50000000 time=1.3574s ops/s=36835752.33 ns/op=27.15
------------------------------------------------------------
int shared relaxed threads=3 ops=50000000 time=2.3341s ops/s=21421597.47 ns/op=46.68
size_t shared relaxed threads=3 ops=50000000 time=1.8018s ops/s=27750189.59 ns/op=36.04
int shared seq_cst threads=3 ops=50000000 time=2.2140s ops/s=22583354.75 ns/op=44.28
size_t shared seq_cst threads=3 ops=50000000 time=3.3697s ops/s=14838202.43 ns/op=67.39
int shared CAS threads=3 ops=50000000 time=4.1281s ops/s=12112147.93 ns/op=82.56
size_t shared CAS threads=3 ops=50000000 time=5.2940s ops/s=9444581.91 ns/op=105.88
int per-thread relaxed threads=3 ops=50000000 time=2.2324s ops/s=22397277.11 ns/op=44.65
size_t per-thread relaxed threads=3 ops=50000000 time=2.3911s ops/s=20911079.76 ns/op=47.82
------------------------------------------------------------
int shared relaxed threads=4 ops=50000000 time=2.5003s ops/s=19997924.19 ns/op=50.01
size_t shared relaxed threads=4 ops=50000000 time=2.2927s ops/s=21808223.78 ns/op=45.85
int shared seq_cst threads=4 ops=50000000 time=2.4988s ops/s=20009473.73 ns/op=49.98
size_t shared seq_cst threads=4 ops=50000000 time=3.5007s ops/s=14282674.37 ns/op=70.01
int shared CAS threads=4 ops=50000000 time=4.6832s ops/s=10676373.71 ns/op=93.66
size_t shared CAS threads=4 ops=50000000 time=14.6833s ops/s=3405230.01 ns/op=293.67
int per-thread relaxed threads=4 ops=50000000 time=10.0129s ops/s=4993537.05 ns/op=200.26
size_t per-thread relaxed threads=4 ops=50000000 time=10.0845s ops/s=4958110.80 ns/op=201.69
------------------------------------------------------------
int shared relaxed threads=5 ops=50000000 time=11.6976s ops/s=4274378.64 ns/op=233.95
size_t shared relaxed threads=5 ops=50000000 time=11.1007s ops/s=4504231.02 ns/op=222.01
int shared seq_cst threads=5 ops=50000000 time=11.6111s ops/s=4306222.53 ns/op=232.22
size_t shared seq_cst threads=5 ops=50000000 time=18.5938s ops/s=2689067.57 ns/op=371.88
int shared CAS threads=5 ops=50000000 time=24.1049s ops/s=2074265.29 ns/op=482.10
size_t shared CAS threads=5 ops=50000000 time=31.7937s ops/s=1572638.73 ns/op=635.87
int per-thread relaxed threads=5 ops=50000000 time=9.3022s ops/s=5375057.17 ns/op=186.04
size_t per-thread relaxed threads=5 ops=50000000 time=12.7956s ops/s=3907601.15 ns/op=255.91
Is there a better way to benchmark their performance?
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>  // uint64_t — previously only available transitively
#include <iomanip>
#include <iostream>
#include <string>
#include <thread>
#include <type_traits>
#include <utility>
#include <vector>
// Monotonic clock for timing; immune to wall-clock adjustments.
using steady_clock_t = std::chrono::steady_clock;
// Total operations per benchmark, split across all worker threads.
static constexpr uint64_t OPS_PER_BENCH = 50'000'000ULL;
// Upper bound on the thread count sweep in main().
static constexpr int MAX_THREADS = 8;
// One benchmark measurement: the label, the thread count it ran with,
// the elapsed wall time, and the number of operations performed.
struct Result {
std::string name;    // human-readable benchmark label
int threads;         // number of worker threads used
double seconds;      // elapsed wall-clock time for the whole run
uint64_t ops;        // total operations performed across all threads
};
// Time a single invocation of `body` on the steady clock and package the
// measurement as a Result.  Note: everything `body` does is inside the
// timed interval — in this file that includes thread creation and join,
// which adds fixed overhead to every measurement.
template <typename F>
Result run_bench(const std::string& name, int nthreads, F&& body) {
    const auto t0 = steady_clock_t::now();
    body();
    const auto t1 = steady_clock_t::now();
    const double elapsed = std::chrono::duration<double>(t1 - t0).count();
    return Result{name, nthreads, elapsed, OPS_PER_BENCH};
}
// Pretty-print one Result as a single aligned line with derived
// throughput (ops/s) and latency (ns/op) figures.
void print_result(const Result& r) {
    const double throughput = static_cast<double>(r.ops) / r.seconds;
    const double nanos_per_op = (r.seconds * 1e9) / static_cast<double>(r.ops);
    std::cout << std::left << std::setw(30) << r.name
              << " threads=" << r.threads
              << " ops=" << r.ops
              << " time=" << std::fixed << std::setprecision(4) << r.seconds << "s"
              // std::fixed is sticky, so only the precision needs resetting.
              << " ops/s=" << std::setprecision(2) << throughput
              << " ns/op=" << std::setprecision(2) << nanos_per_op
              << "\n";
}
// ---------------------------------------------------------
// 1) shared counter, relaxed fetch_add
// ---------------------------------------------------------
// Benchmark: all threads hammer one shared counter with relaxed fetch_add.
// This is the maximum-contention case: every increment bounces the counter's
// cache line between cores.
template <typename AtomicT>
Result bench_shared_relaxed(int nthreads, const std::string& label) {
    // Own cache line so neighbouring stack data can't cause false sharing.
    alignas(64) AtomicT counter{0};
    uint64_t ops_per_thread = OPS_PER_BENCH / nthreads;
    auto worker = [&](int) {
        for (uint64_t i = 0; i < ops_per_thread; ++i) {
            counter.fetch_add(1, std::memory_order_relaxed);
        }
    };
    Result r = run_bench(label, nthreads, [&] {
        std::vector<std::thread> ths;
        ths.reserve(nthreads);
        for (int t = 0; t < nthreads; ++t)
            ths.emplace_back(worker, t);
        for (auto& th : ths) th.join();
    });
    // BUG FIX: when nthreads does not divide OPS_PER_BENCH (e.g. 3, 5, 6, 7
    // threads), the work actually performed is nthreads * (OPS_PER_BENCH /
    // nthreads), not OPS_PER_BENCH.  Report the true count so ops/s and
    // ns/op are computed from the work that really happened.
    r.ops = ops_per_thread * static_cast<uint64_t>(nthreads);
    return r;
}
// ---------------------------------------------------------
// 2) shared counter, seq_cst fetch_add
// ---------------------------------------------------------
// Benchmark: shared counter with seq_cst fetch_add.  On x86 a lock-prefixed
// RMW is already a full barrier, so this is expected to be close to the
// relaxed variant; the comparison is more interesting on weaker ISAs.
template <typename AtomicT>
Result bench_shared_seqcst(int nthreads, const std::string& label) {
    // Own cache line so neighbouring stack data can't cause false sharing.
    alignas(64) AtomicT counter{0};
    uint64_t ops_per_thread = OPS_PER_BENCH / nthreads;
    auto worker = [&](int) {
        for (uint64_t i = 0; i < ops_per_thread; ++i) {
            counter.fetch_add(1, std::memory_order_seq_cst);
        }
    };
    Result r = run_bench(label, nthreads, [&] {
        std::vector<std::thread> ths;
        ths.reserve(nthreads);
        for (int t = 0; t < nthreads; ++t)
            ths.emplace_back(worker, t);
        for (auto& th : ths) th.join();
    });
    // BUG FIX: report the op count actually executed; integer division above
    // drops the remainder when nthreads doesn't divide OPS_PER_BENCH.
    r.ops = ops_per_thread * static_cast<uint64_t>(nthreads);
    return r;
}
// ---------------------------------------------------------
// 3) CAS loop on shared variable
// ---------------------------------------------------------
// Benchmark: CAS-retry loop on a shared variable.  One "op" is one
// *successful* compare_exchange; failed attempts retry inside the inner
// loop and are NOT counted, so under contention each op can cost several
// CAS instructions — which is why ns/op grows faster here than for
// fetch_add.
template <typename AtomicT>
Result bench_shared_cas(int nthreads, const std::string& label) {
    // Own cache line so neighbouring stack data can't cause false sharing.
    alignas(64) AtomicT counter{0};
    uint64_t ops_per_thread = OPS_PER_BENCH / nthreads;
    // Deduce the underlying value type from load() so this works for any
    // atomic-like type, not just std::atomic<T>.
    using T = typename std::remove_reference<
        decltype(std::declval<AtomicT&>().load(std::memory_order_relaxed))
    >::type;
    auto worker = [&](int) {
        for (uint64_t i = 0; i < ops_per_thread; ++i) {
            T expected = counter.load(std::memory_order_relaxed);
            for (;;) {
                T desired = static_cast<T>(expected + 1);
                if (counter.compare_exchange_weak(
                        expected, desired,
                        std::memory_order_acq_rel,
                        std::memory_order_relaxed)) {
                    break;
                }
                // On failure, compare_exchange_weak reloaded `expected`
                // with the current value — just retry.
            }
        }
    };
    Result r = run_bench(label, nthreads, [&] {
        std::vector<std::thread> ths;
        ths.reserve(nthreads);
        for (int t = 0; t < nthreads; ++t)
            ths.emplace_back(worker, t);
        for (auto& th : ths) th.join();
    });
    // BUG FIX: report the op count actually executed; integer division above
    // drops the remainder when nthreads doesn't divide OPS_PER_BENCH.
    r.ops = ops_per_thread * static_cast<uint64_t>(nthreads);
    return r;
}
// ---------------------------------------------------------
// 4) per-thread atomics (low contention)
// ---------------------------------------------------------
// Benchmark: each thread increments its OWN atomic — intended to measure
// the uncontended cost of an atomic RMW.
template <typename AtomicT>
Result bench_perthread_relaxed(int nthreads, const std::string& label) {
    uint64_t ops_per_thread = OPS_PER_BENCH / nthreads;
    // BUG FIX: the counters used to be stored back-to-back in a
    // std::vector<AtomicT>, so many 4/8-byte "per-thread" counters shared a
    // single 64-byte cache line.  That false sharing made this supposedly
    // low-contention benchmark behave like a contended one (the effect is
    // visible in the pasted results, where per-thread ns/op explodes at
    // 4+ threads).  Pad each counter to a full cache line instead.
    // (Over-aligned allocation for the vector requires C++17.)
    struct alignas(64) PaddedCounter {
        AtomicT value{0};  // brace-init: pre-C++20 a value-initialized atomic is NOT guaranteed zeroed
    };
    std::vector<PaddedCounter> counters(nthreads);
    auto worker = [&](int id) {
        auto& my = counters[id].value;
        for (uint64_t i = 0; i < ops_per_thread; ++i) {
            my.fetch_add(1, std::memory_order_relaxed);
        }
    };
    Result r = run_bench(label, nthreads, [&] {
        std::vector<std::thread> ths;
        ths.reserve(nthreads);
        for (int t = 0; t < nthreads; ++t)
            ths.emplace_back(worker, t);
        for (auto& th : ths) th.join();
    });
    // BUG FIX: report the op count actually executed; integer division above
    // drops the remainder when nthreads doesn't divide OPS_PER_BENCH.
    r.ops = ops_per_thread * static_cast<uint64_t>(nthreads);
    return r;
}
int main() {
std::cout << "atomic<int> vs atomic<size_t> micro-benchmark\n";
std::cout << "OPS_PER_BENCH = " << OPS_PER_BENCH << "\n\n";
for (int threads = 1; threads <= MAX_THREADS; ++threads) {
{
auto r1 = bench_shared_relaxed<std::atomic<int>>(threads, "int shared relaxed");
auto r2 = bench_shared_relaxed<std::atomic<size_t>>(threads, "size_t shared relaxed");
print_result(r1);
print_result(r2);
}
{
auto r1 = bench_shared_seqcst<std::atomic<int>>(threads, "int shared seq_cst");
auto r2 = bench_shared_seqcst<std::atomic<size_t>>(threads, "size_t shared seq_cst");
print_result(r1);
print_result(r2);
}
{
auto r1 = bench_shared_cas<std::atomic<int>>(threads, "int shared CAS");
auto r2 = bench_shared_cas<std::atomic<size_t>>(threads, "size_t shared CAS");
print_result(r1);
print_result(r2);
}
{
auto r1 = bench_perthread_relaxed<std::atomic<int>>(threads, "int per-thread relaxed");
auto r2 = bench_perthread_relaxed<std::atomic<size_t>>(threads, "size_t per-thread relaxed");
print_result(r1);
print_result(r2);
}
std::cout << "------------------------------------------------------------\n";
}
return 0;
}
There's no difference for xchg or mov or lock cmpxchg or any other atomic RMWs between 32 and 64-bit, although 64-bit operand-size does cost a REX prefix which wouldn't be needed if the compiler avoided any registers R8-R15. I wouldn't expect any differences on other ISAs either for aligned access to L1d cache with 32-bit or pointer width.