C (GCC), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

void mat_mul_simd(void *_m, void *_x, void *_y) {
  enum { n = MAT_N };
  float (*m)[n] = _m, (*x)[n] = _x, (*y)[n] = _y;
  for (int i = 0; i < n; ++i) {
    // One output row, accumulated across eight 8-float AVX registers.
    __m256 mi[n / 8];
    for (int k = 0; k < n / 8; ++k)
      mi[k] = _mm256_setzero_ps();
    for (int j = 0; j < n; ++j) {
      // Broadcast x[i][j] to all lanes and FMA it against row j of y.
      __m256 xij = _mm256_broadcast_ss(&x[i][j]);
      for (int k = 0; k < n / 8; ++k)
        mi[k] = _mm256_fmadd_ps(xij, _mm256_load_ps(&y[j][k * 8]), mi[k]);
    }
    for (int k = 0; k < n / 8; ++k)
      _mm256_store_ps(&m[i][k * 8], mi[k]);
  }
}

https://godbolt.org/z/xxvr45b67
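To sanity-check the kernel (and as a starting point for your own timing), here's a minimal harness sketch. It assumes it's appended to the file above; mat_mul_ref is a hypothetical scalar reference of my own, not the mat_mul_fast baseline from the question. The _Alignas(32) matters because _mm256_load_ps/_mm256_store_ps fault on unaligned addresses, and since FMA contraction rounds differently from a scalar product, the harness checks a maximum error rather than bit-for-bit equality:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static _Alignas(32) float x[MAT_N][MAT_N], y[MAT_N][MAT_N],
    m[MAT_N][MAT_N], r[MAT_N][MAT_N];

// Naive scalar reference (hypothetical; for checking only).
static void mat_mul_ref(void) {
  for (int i = 0; i < MAT_N; ++i)
    for (int k = 0; k < MAT_N; ++k) {
      float s = 0;
      for (int j = 0; j < MAT_N; ++j)
        s += x[i][j] * y[j][k];
      r[i][k] = s;
    }
}

int main(void) {
  for (int i = 0; i < MAT_N; ++i)
    for (int j = 0; j < MAT_N; ++j) {
      x[i][j] = (float)rand() / RAND_MAX;
      y[i][j] = (float)rand() / RAND_MAX;
    }
  mat_mul_simd(m, x, y);
  mat_mul_ref();
  float err = 0;
  for (int i = 0; i < MAT_N; ++i)
    for (int k = 0; k < MAT_N; ++k)
      err = fmaxf(err, fabsf(m[i][k] - r[i][k]));
  printf("max abs error: %g\n", err);  // small but nonzero: FMA rounds once
  return 0;
}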

C (GCC 11)

This fully auto-vectorized version is very nearly as fast as the intrinsics code above under GCC 11 (gcc -O3 -mavx2 -mfma), but regresses significantly under GCC 12. Auto-vectorization is fragile. 😞

enum { MAT_N = 64 };

void mat_mul_auto(void *_m, void *_x, void *_y) {
  enum { n = MAT_N };
  // restrict + assume_aligned give the vectorizer the same guarantees
  // the intrinsics version gets for free.
  float (*restrict m)[n] = __builtin_assume_aligned(_m, 32);
  float (*restrict x)[n] = __builtin_assume_aligned(_x, 32);
  float (*restrict y)[n] = __builtin_assume_aligned(_y, 32);
  for (int i = 0; i < n; ++i) {
    float mi[n] = {};  // accumulate one output row in a local buffer
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < n; ++k)
        mi[k] += x[i][j] * y[j][k];
    for (int k = 0; k < n; ++k)
      m[i][k] = mi[k];
  }
}

https://godbolt.org/z/h4GEqGjGW
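If you want to keep the portable version but hedge against that fragility, one option (a sketch of mine, not something I measured across compiler versions) is to make the vectorization request explicit with #pragma omp simd, which GCC honors under -fopenmp-simd without pulling in the OpenMP runtime; adding -fopt-info-vec then reports whether each loop actually vectorized. No guarantee this dodges the GCC 12 regression, but it turns silent de-vectorization into something you can see:

// Sketch: mat_mul_auto with an explicit vectorization hint
// (mat_mul_hint is a hypothetical name, not from the answer above).
// Compile with gcc -O3 -mavx2 -mfma -fopenmp-simd -fopt-info-vec.
void mat_mul_hint(void *_m, void *_x, void *_y) {
  enum { n = MAT_N };
  float (*restrict m)[n] = __builtin_assume_aligned(_m, 32);
  float (*restrict x)[n] = __builtin_assume_aligned(_x, 32);
  float (*restrict y)[n] = __builtin_assume_aligned(_y, 32);
  for (int i = 0; i < n; ++i) {
    float mi[n] = {};
    for (int j = 0; j < n; ++j) {
#pragma omp simd
      for (int k = 0; k < n; ++k)
        mi[k] += x[i][j] * y[j][k];
    }
    for (int k = 0; k < n; ++k)
      m[i][k] = mi[k];
  }
}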
