I read this other SO question and answer and it seems to make sense to me, but I have one additional question to add to it.
The most upvoted answer says:
For small functions that are called frequently that can make a big performance difference.
Okay, so what would be considered a small function?
The reason that I am asking is that I am looking at using a math library, vectormath from the bullet physics framework. All their math functions are static inline but while some are fairly short some are pretty long.
Here's what I consider short:
/*
 * Copy the three columns (col0, col1, col2) of *mat into the matching
 * columns of *result, one vmathV3Copy call per column.
 */
static inline void vmathM3Copy( VmathMatrix3 *result, const VmathMatrix3 *mat )
{
    vmathV3Copy( &result->col0, &mat->col0 );
    vmathV3Copy( &result->col1, &mat->col1 );
    vmathV3Copy( &result->col2, &mat->col2 );
}
But even that would embed this function three times:
/*
 * Copy the x, y and z members of *vec into *result, member by member.
 * No other member of *result is written.
 */
static inline void vmathV3Copy( VmathVector3 *result, const VmathVector3 *vec )
{
    result->x = vec->x;
    result->y = vec->y;
    result->z = vec->z;
}
Here's what seems to be long to me:
/*
 * Compute the determinant of the 4x4 matrix *mat and return it as a float.
 *
 * The sixteen elements are first loaded into locals, four per column:
 *   col0 -> mA mB mC mD,  col1 -> mE mF mG mH,
 *   col2 -> mI mJ mK mL,  col3 -> mM mN mO mP   (x, y, z, w in that order).
 * Six 2x2 sub-determinants (tmp0..tmp5) are shared between the four cofactor
 * terms dx, dy, dz and dw, which are then weighted by mA, mE, mI and mM.
 */
static inline float vmathM4Determinant( const VmathMatrix4 *mat )
{
    /* Column 0 */
    const float mA = mat->col0.x;
    const float mB = mat->col0.y;
    const float mC = mat->col0.z;
    const float mD = mat->col0.w;
    /* Column 1 */
    const float mE = mat->col1.x;
    const float mF = mat->col1.y;
    const float mG = mat->col1.z;
    const float mH = mat->col1.w;
    /* Column 2 */
    const float mI = mat->col2.x;
    const float mJ = mat->col2.y;
    const float mK = mat->col2.z;
    const float mL = mat->col2.w;
    /* Column 3 */
    const float mM = mat->col3.x;
    const float mN = mat->col3.y;
    const float mO = mat->col3.z;
    const float mP = mat->col3.w;

    /* 2x2 sub-determinants, each reused by two of the terms below. */
    const float tmp0 = mK * mD - mC * mL;
    const float tmp1 = mO * mH - mG * mP;
    const float tmp2 = mB * mK - mJ * mC;
    const float tmp3 = mF * mO - mN * mG;
    const float tmp4 = mJ * mD - mB * mL;
    const float tmp5 = mN * mH - mF * mP;

    /* Cofactor terms paired with mA, mE, mI and mM respectively. */
    const float dx = mJ * tmp1 - mL * tmp3 - mK * tmp5;
    const float dy = mN * tmp0 - mP * tmp2 - mO * tmp4;
    const float dz = mD * tmp3 + mC * tmp5 - mB * tmp1;
    const float dw = mH * tmp2 + mG * tmp4 - mF * tmp0;

    return mA * dx + mE * dy + mI * dz + mM * dw;
}
or even this one
/*
 * Compute the inverse of the 4x4 matrix *mat and store it in *result.
 *
 * Outline: load the sixteen elements into locals, build four column vectors
 * res0..res3 of cofactor terms, then scale each by detInv (the reciprocal of
 * the determinant) while writing them to result->col0..col3.
 *
 * NOTE: the divisor of detInv is not checked against zero here, so a
 * singular input is not detected by this function.
 * NOTE(review): vmathV4SetX/Y/Z/W and vmathV4ScalarMul are defined outside
 * this view; the comments below assume they store into the named member of
 * their first argument and write (second argument * scalar) into their first
 * argument, respectively -- confirm against their definitions.
 */
static inline void vmathM4Inverse( VmathMatrix4 *result, const VmathMatrix4 *mat )
{
VmathVector4 res0, res1, res2, res3;
float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
/* Read every element of *mat up front; *mat is not accessed again below. */
mA = mat->col0.x;
mB = mat->col0.y;
mC = mat->col0.z;
mD = mat->col0.w;
mE = mat->col1.x;
mF = mat->col1.y;
mG = mat->col1.z;
mH = mat->col1.w;
mI = mat->col2.x;
mJ = mat->col2.y;
mK = mat->col2.z;
mL = mat->col2.w;
mM = mat->col3.x;
mN = mat->col3.y;
mO = mat->col3.z;
mP = mat->col3.w;
/* First set of 2x2 terms: the same six expressions vmathM4Determinant uses. */
tmp0 = ( ( mK * mD ) - ( mC * mL ) );
tmp1 = ( ( mO * mH ) - ( mG * mP ) );
tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
tmp3 = ( ( mF * mO ) - ( mN * mG ) );
tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
tmp5 = ( ( mN * mH ) - ( mF * mP ) );
/* res0 receives the same four expressions as dx, dy, dz, dw in vmathM4Determinant. */
vmathV4SetX( &res0, ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
vmathV4SetY( &res0, ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
vmathV4SetZ( &res0, ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
vmathV4SetW( &res0, ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
/* Reciprocal of the determinant, formed from res0 and the x members of the four columns. */
detInv = ( 1.0f / ( ( ( ( mA * res0.x ) + ( mE * res0.y ) ) + ( mI * res0.z ) ) + ( mM * res0.w ) ) );
/* Partial values for res1, res3 and res2 that need the FIRST set of tmp terms;
   they must be stored before tmp0..tmp5 are overwritten below. */
vmathV4SetX( &res1, ( mI * tmp1 ) );
vmathV4SetY( &res1, ( mM * tmp0 ) );
vmathV4SetZ( &res1, ( mA * tmp1 ) );
vmathV4SetW( &res1, ( mE * tmp0 ) );
vmathV4SetX( &res3, ( mI * tmp3 ) );
vmathV4SetY( &res3, ( mM * tmp2 ) );
vmathV4SetZ( &res3, ( mA * tmp3 ) );
vmathV4SetW( &res3, ( mE * tmp2 ) );
vmathV4SetX( &res2, ( mI * tmp5 ) );
vmathV4SetY( &res2, ( mM * tmp4 ) );
vmathV4SetZ( &res2, ( mA * tmp5 ) );
vmathV4SetW( &res2, ( mE * tmp4 ) );
/* tmp0..tmp5 are reused for a second, different set of 2x2 terms. */
tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
tmp1 = ( ( mM * mF ) - ( mE * mN ) );
tmp2 = ( ( mI * mD ) - ( mA * mL ) );
tmp3 = ( ( mM * mH ) - ( mE * mP ) );
tmp4 = ( ( mI * mC ) - ( mA * mK ) );
tmp5 = ( ( mM * mG ) - ( mE * mO ) );
/* Finish res2, res3 and res1: each member combines second-set terms with the
   partial value stored earlier (read back via res2.x, res3.y, ...). */
vmathV4SetX( &res2, ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.x ) );
vmathV4SetY( &res2, ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.y ) );
vmathV4SetZ( &res2, ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.z ) );
vmathV4SetW( &res2, ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.w ) );
vmathV4SetX( &res3, ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.x ) );
vmathV4SetY( &res3, ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.y ) );
vmathV4SetZ( &res3, ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.z ) );
vmathV4SetW( &res3, ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.w ) );
vmathV4SetX( &res1, ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.x ) );
vmathV4SetY( &res1, ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.y ) );
vmathV4SetZ( &res1, ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.z ) );
vmathV4SetW( &res1, ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.w ) );
/* Scale each column by detInv; this is the only place *result is passed to a callee. */
vmathV4ScalarMul( &result->col0, &res0, detInv );
vmathV4ScalarMul( &result->col1, &res1, detInv );
vmathV4ScalarMul( &result->col2, &res2, detInv );
vmathV4ScalarMul( &result->col3, &res3, detInv );
}
The guys who wrote the library obviously understand the math very well, but if you're doing a lot of math operations and the compiler is probably inlining all these functions, wouldn't you get a bigger file?
`inline` does not mean that the compiler must inline the function; it merely "suggests that calls to the function be as fast as possible" (according to the final draft of the C11 standard, 6.7.4, clause 6).