
There is a minimalistic (around 200 lines) neural network C library on GitHub called Tinn. Tinn uses dynamically allocated C arrays to represent its weights, biases, and neurons. I implemented part of it in C++ using statically sized std::array instead, expecting the static arrays to be much faster. After some measurements, however, it is exactly the opposite. Am I doing something wrong, or is there a reason why the static arrays are beaten by the dynamic ones even with -O3 optimizations?

Neural network with static arrays - MLP_1.h

#pragma once    

#include <cmath>
#include <array>
#include <iostream>
#include <fstream>  

template<class Type, size_t nIn, size_t nHid, size_t nOut>
class MLP_1
{
public:
    static constexpr size_t nInputs = nIn;
    static constexpr size_t nHiddens = nHid;
    static constexpr size_t nOutputs = nOut;    

    static constexpr size_t nWeights = nHiddens * (nInputs + nOutputs);
    static constexpr size_t nBiases = 2;
    static constexpr size_t weightIndexOffset = nHiddens * nInputs; 

    std::array<Type, nWeights> weights;
    std::array<Type, nBiases> biases;   

    std::array<Type, nHiddens> hiddenNeurons;
    std::array<Type, nOutputs> outputNeurons;


    static Type activationFunction(const Type x) noexcept
    {
        //return x / (1 + std::abs(x)); // faster
        return 1.0 / (1.0 + std::exp(-x));
    }   


    void forwardPropagation(const Type* const input) noexcept
    {
        // Calculate hidden layer neuron values.
        for(size_t i = 0; i < nHiddens; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nInputs; ++j)
            {
                const size_t weightIndex = (i * nInputs) + j;
                sum += input[j] * weights[weightIndex];
            }
            hiddenNeurons[i] = activationFunction(sum + biases[0]);
        }
        // Calculate output layer neuron values.
        for(size_t i = 0; i < nOutputs; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nHiddens; ++j)
            {
                const size_t weightIndex = weightIndexOffset + (i * nHiddens) + j;
                sum += hiddenNeurons[j] * weights[weightIndex];
            }
            outputNeurons[i] = activationFunction(sum + biases[1]);
        }   
    }   

    const Type* predict(const Type* const input) noexcept
    {
        forwardPropagation(input);
        return outputNeurons.data();
    }   

    const std::array<Type, nOutputs>& predict(const std::array<Type, nInputs>& inputArray)
    {
        forwardPropagation(inputArray.data());
        return outputNeurons;
    }   

    void load(const char* const path) noexcept
    {
        std::ifstream inputFile(path);
        size_t nInputsFile, nHiddensFile, nOutputsFile;
        std::string ignoreString;   

        inputFile >> nInputsFile >> nHiddensFile >> nOutputsFile;   

        if ((nInputs != nInputsFile) || (nHiddens != nHiddensFile) || (nOutputs != nOutputsFile))
        {
            std::cout << "Size missmatch.\n";
            std::cout << nInputs << ", " << nHiddens << ", " << nOutputs << std::endl;
            std::cout << nInputsFile << ", " << nHiddensFile << ", " << nOutputsFile << std::endl;
        }   

        for (auto& bias : biases) { inputFile >> bias; }

        for (auto& weight : weights) { inputFile >> weight; }
    }   

    void printWeights() const
    {
        std::cout << "weights: ";
        for (const auto& w : weights) { std::cout << w << " "; }
        std::cout << "\n";
    }
    void printBiases() const
    {
        std::cout << "biases: ";
        for (const auto& b : biases) { std::cout << b << " "; }
        std::cout << "\n";
    }   

    void print() const
    {
        printWeights();
        printBiases();
    }
};

Neural network with dynamic arrays - Tinn.h

#pragma once    

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>   


typedef struct
{
    // All the weights.
    float* w;
    // Hidden to output layer weights.
    float* x;
    // Biases.
    float* b;
    // Hidden layer.
    float* h;
    // Output layer.
    float* o;
    // Number of biases - always two - Tinn only supports a single hidden layer.
    int nb;
    // Number of weights.
    int nw;
    // Number of inputs.
    int nips;
    // Number of hidden neurons.
    int nhid;
    // Number of outputs.
    int nops;
}
Tinn;   

// Returns floating point random from 0.0 - 1.0.
static float frand()
{
    return rand() / (float) RAND_MAX;
}   

// Activation function.
static float act(const float a)
{
    return 1.0f / (1.0f + expf(-a));
}   

// Performs forward propagation.
static void fprop(const Tinn t, const float* const in)
{
    // Calculate hidden layer neuron values.
    for(int i = 0; i < t.nhid; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nips; j++)
            sum += in[j] * t.w[i * t.nips + j];
        t.h[i] = act(sum + t.b[0]);
    }
    // Calculate output layer neuron values.
    for(int i = 0; i < t.nops; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nhid; j++)
            sum += t.h[j] * t.x[i * t.nhid + j];
        t.o[i] = act(sum + t.b[1]);
    }
}   

// Randomizes tinn weights and biases.
static void wbrand(const Tinn t)
{
    for(int i = 0; i < t.nw; i++) t.w[i] = frand() - 0.5f;
    for(int i = 0; i < t.nb; i++) t.b[i] = frand() - 0.5f;
}   

// Returns an output prediction given an input.
float* xtpredict(const Tinn t, const float* const in)
{
    fprop(t, in);
    return t.o;
}   


// Constructs a tinn with number of inputs, number of hidden neurons, and number of outputs
Tinn xtbuild(const int nips, const int nhid, const int nops)
{
    Tinn t;
    // Tinn only supports one hidden layer so there are two biases.
    t.nb = 2;
    t.nw = nhid * (nips + nops);
    t.w = (float*) calloc(t.nw, sizeof(*t.w));
    t.x = t.w + nhid * nips;
    t.b = (float*) calloc(t.nb, sizeof(*t.b));
    t.h = (float*) calloc(nhid, sizeof(*t.h));
    t.o = (float*) calloc(nops, sizeof(*t.o));
    t.nips = nips;
    t.nhid = nhid;
    t.nops = nops;
    wbrand(t);
    return t;
}   

// Saves a tinn to disk.
void xtsave(const Tinn t, const char* const path)
{
    FILE* const file = fopen(path, "w");
    // Save header.
    fprintf(file, "%d %d %d\n", t.nips, t.nhid, t.nops);
    // Save biases and weights.
    for(int i = 0; i < t.nb; i++) fprintf(file, "%f\n", (double) t.b[i]);
    for(int i = 0; i < t.nw; i++) fprintf(file, "%f\n", (double) t.w[i]);
    fclose(file);
}   

// Loads a tinn from disk.
Tinn xtload(const char* const path)
{
    FILE* const file = fopen(path, "r");
    int nips = 0;
    int nhid = 0;
    int nops = 0;
    // Load header.
    fscanf(file, "%d %d %d\n", &nips, &nhid, &nops);
    // Build a new tinn.
    const Tinn t = xtbuild(nips, nhid, nops);
    // Load biases and weights.
    for(int i = 0; i < t.nb; i++) fscanf(file, "%f\n", &t.b[i]);
    for(int i = 0; i < t.nw; i++) fscanf(file, "%f\n", &t.w[i]);
    fclose(file);
    return t;
}   

// Frees object from heap.
void xtfree(const Tinn t)
{
    free(t.w);
    free(t.b);
    free(t.h);
    free(t.o);
}   

// Prints an array of floats. Useful for printing predictions.
void xtprint(const float* arr, const int size)
{
    for(int i = 0; i < size; i++)
        printf("%f ", (double) arr[i]);
    printf("\n");
}   

void xtprint(const Tinn& tinn)
{
    printf("weights: ");
    xtprint(tinn.w, tinn.nw);   

    printf("biases: ");
    xtprint(tinn.b, tinn.nb);
}

Main with tests - main.cpp

#include <iostream>
#include "MLP_1.h"
#include "Tinn.h"
#include <array>
#include <iterator>
#include <random>
#include <algorithm>    

#include <chrono>   

constexpr size_t in = 748;
constexpr size_t hid = 20;
constexpr size_t out = 5;   

const char* const path = "tinn01.txt";  

template< class Iter >
void fill_with_random_values( Iter start, Iter end, float min, float max)
{
    static std::random_device rd;    // only needs to be initialized once
    static std::mt19937 mte(rd());   // relatively big object, so create it once

    std::uniform_real_distribution<float> dist(min, max);   

    std::generate(start, end, [&] () { return dist(mte); });
}   

void testMLP(MLP_1<float, in, hid, out>& mlp, const std::array<float, in>& array)
{
    std::cout << "------MLP------\n";
    float sum = 0;
    const float* data = array.data();   

    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = mlp.predict(data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);  

    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------MLP------\n";
}   

void testTinn(Tinn& tinn, const std::array<float, in>& array)
{
    std::cout << "------TINN------\n";
    float sum = 0;
    const float* data = array.data();   

    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = xtpredict(tinn, data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);  

    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------TINN------\n";
}   

int main()
{

    Tinn sTinn = xtbuild(in, hid, out);
    xtsave(sTinn, path);    

    Tinn tinn1 = xtload(path);  

    MLP_1<float, in, hid, out> mlp;
    mlp.load(path); 

    std::array<float, in> inputTest;    

    fill_with_random_values(inputTest.begin(), inputTest.end(), -10.0f, 10.0f);

    testMLP(mlp, inputTest);
    std::cout << "\n";
    testTinn(tinn1, inputTest); 

    return 0;
}

With g++ -std=c++14 -O0 main.cpp I get:

------MLP------
sum:33171.4
elapsed time: 6524ms
------MLP------

------TINN------
sum:33171.4
elapsed time: 2256ms
------TINN------

With g++ -std=c++14 -O3 main.cpp I get:

------MLP------
sum:19567.4
elapsed time: 758ms
------MLP------

------TINN------
sum:19567.4
elapsed time: 739ms
------TINN------
Comments:

  • -O0 makes the performance numbers irrelevant. The -O3 numbers are pretty close.
  • One guess could be the operator[] for arrays doing some range checks in debug mode.
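
To illustrate the second comment's guess, here is a small sketch (not taken from the question's code, and the macro behavior is libstdc++-version-dependent): std::array::operator[] performs no bounds check in an ordinary build, while at() always checks; debug macros such as -D_GLIBCXX_ASSERTIONS can add checks to operator[] as well, which would only affect the unoptimized run.

#include <array>

int main()
{
    std::array<float, 4> a{};
    float x = a[2];      // operator[]: unchecked in a normal -O0 or -O3 build
    float y = a.at(2);   // at(): always range-checked, throws std::out_of_range
    return static_cast<int>(x + y);
}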

1 Answer

With dynamic memory allocation, the slow part is allocating and freeing memory. There is no memory allocation in the loop you measure, so there is no reason to expect the dynamically allocated version to be slower. And indeed, with -O3 optimization, the runtimes are almost identical.
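
To make this concrete, here is a sketch (reusing the question's xtbuild/xtfree; steady_clock is used because it suits interval timing better than system_clock) that times only construction and destruction. This is where the dynamic version actually pays, and none of it appears inside the measured prediction loops:

#include "Tinn.h"
#include <chrono>
#include <cstdio>

int main()
{
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 60000; ++i)
    {
        // xtbuild performs the calloc calls (plus weight randomization),
        // xtfree the matching free calls.
        Tinn t = xtbuild(748, 20, 5);
        xtfree(t);
    }
    const auto end = std::chrono::steady_clock::now();
    const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    printf("alloc/free only: %lld ms\n", (long long) elapsed.count());
    return 0;
}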

One difference between the two programs that could affect runtime is the random number generator: std::mt19937 produces far better randomness than rand(), but may be slower. Note, though, that both generators run only during setup, outside the timed loops.
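
If you want to check the generator cost in isolation, a sketch along these lines (illustrative only; the numbers will vary by platform and standard library) times both generators producing the same number of floats:

#include <cstdlib>
#include <random>
#include <chrono>
#include <cstdio>

int main()
{
    constexpr int n = 10000000;
    volatile float sink = 0.0f;  // keeps the loops from being optimized away

    // rand(), as used by Tinn's frand().
    const auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        sink = rand() / (float) RAND_MAX;
    const auto t1 = std::chrono::steady_clock::now();

    // std::mt19937 with a uniform_real_distribution, as in main.cpp.
    std::mt19937 mte(12345);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    for (int i = 0; i < n; ++i)
        sink = dist(mte);
    const auto t2 = std::chrono::steady_clock::now();

    using ms = std::chrono::milliseconds;
    printf("rand():  %lld ms\n", (long long) std::chrono::duration_cast<ms>(t1 - t0).count());
    printf("mt19937: %lld ms\n", (long long) std::chrono::duration_cast<ms>(t2 - t1).count());
    (void) sink;
    return 0;
}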
