Skip to main content
Fixed small typo
Source Link
NIZGTR
  • 21
  • 1
  • 2
FVector4 rest(REST_DISTANCE); // Set the constant simd vector outside the loop
FVector4 stiffness(STIFFNESS);

for (int i = 0; i < TEST_SIZE; i++)
{
    const auto& constraint = Constraints[i];
    auto& position1 = Particle[constraint.first].Position;
    auto& position2 = Particle[constraint.second)]second].Position;

    for (int j = 0; j < ITERATION_COUNT; j++)
    {
        auto delta = position2 - position1;
        auto distance = delta.norm();

        auto correctionDistance = (distance - rest) / distance;
        auto pCorrection = correctionDistance * delta * stiffness;

        position1 += pCorrection;
        position2 -= pCorrection;
    }
}
FVector4 rest(REST_DISTANCE); // Set the constant simd vector outside the loop
FVector4 stiffness(STIFFNESS);

for (int i = 0; i < TEST_SIZE; i++)
{
    const auto& constraint = Constraints[i];
    auto& position1 = Particle[constraint.first].Position;
    auto& position2 = Particle[constraint.second)].Position;

    for (int j = 0; j < ITERATION_COUNT; j++)
    {
        auto delta = position2 - position1;
        auto distance = delta.norm();

        auto correctionDistance = (distance - rest) / distance;
        auto pCorrection = correctionDistance * delta * stiffness;

        position1 += pCorrection;
        position2 -= pCorrection;
    }
}
FVector4 rest(REST_DISTANCE); // Set the constant simd vector outside the loop
FVector4 stiffness(STIFFNESS);

for (int i = 0; i < TEST_SIZE; i++)
{
    const auto& constraint = Constraints[i];
    auto& position1 = Particle[constraint.first].Position;
    auto& position2 = Particle[constraint.second].Position;

    for (int j = 0; j < ITERATION_COUNT; j++)
    {
        auto delta = position2 - position1;
        auto distance = delta.norm();

        auto correctionDistance = (distance - rest) / distance;
        auto pCorrection = correctionDistance * delta * stiffness;

        position1 += pCorrection;
        position2 -= pCorrection;
    }
}
removed unnecessary chatty stuff
Source Link
janos
  • 113.1k
  • 15
  • 154
  • 396

This was originally posted on Stackoverflow but it was suggested to post this here for a better answer.


I can't seem to figure out a wayI'm especially interested in ways to further optimize a constraint algorithm. So far I've switched to SIMD sse4 vector operations which nearly doubled the performance. I'm hoping there is more performance to squeeze out that I simply don't see.

I can post the actual structs if needed.

Thank you.

This was originally posted on Stackoverflow but it was suggested to post this here for a better answer.


I can't seem to figure out a way to further optimize a constraint algorithm. So far I've switched to SIMD sse4 vector operations which nearly doubled the performance. I'm hoping there is more performance to squeeze out that I simply don't see.

I can post the actual structs if needed.

Thank you.

I'm especially interested in ways to further optimize a constraint algorithm. So far I've switched to SIMD sse4 vector operations which nearly doubled the performance. I'm hoping there is more performance to squeeze out that I simply don't see.

I can post the actual structs if needed.

Source Link
NIZGTR
  • 21
  • 1
  • 2

Constraint Algorithm Optimization in C++

This was originally posted on Stackoverflow but it was suggested to post this here for a better answer.


I can't seem to figure out a way to further optimize a constraint algorithm. So far I've switched to SIMD sse4 vector operations which nearly doubled the performance. I'm hoping there is more performance to squeeze out that I simply don't see.

Here is the algorithm:

FVector4 rest(REST_DISTANCE); // Set the constant simd vector outside the loop
FVector4 stiffness(STIFFNESS);

for (int i = 0; i < TEST_SIZE; i++)
{
    const auto& constraint = Constraints[i];
    auto& position1 = Particle[constraint.first].Position;
    auto& position2 = Particle[constraint.second)].Position;

    for (int j = 0; j < ITERATION_COUNT; j++)
    {
        auto delta = position2 - position1;
        auto distance = delta.norm();

        auto correctionDistance = (distance - rest) / distance;
        auto pCorrection = correctionDistance * delta * stiffness;

        position1 += pCorrection;
        position2 -= pCorrection;
    }
}

Constraints is an array of a simple struct that holds two unsigned int's, Particle is an array of FParticle structs which holds three FVector4 fields and a Fvector4 simply wraps a __m128 and provides operators and functions like norm() which utilize _mm_* intrinsic functions.

Currently the algorithm takes 55 cycles per Constraint per Iteration, this is with TEST_SIZE of 150,000 and ITERATION_COUNT at 16. I know there is performance to gain from making it parallel but I'm hoping to get more performance out of the single threaded version specifically.

I can post the actual structs if needed.

Thank you.

FVector4 code:

struct FVector4
{
    FVector4()
    {}
    explicit FVector4(const __m128& InitData) : Data(InitData)
    {
    }
    explicit FVector4(__m128&& InitData) : Data(InitData)
    {
    }
    explicit FVector4(float InitValue)
    {
        Data = _mm_set_ps1(InitValue);
    }
    explicit FVector4(float X, float Y, float Z, float W = 0.f)
    {
        Data = _mm_setr_ps(X, Y, Z, W);
    }

    __forceinline float& operator[](int index)
    {
        return Data.m128_f32[index];
    }
    __forceinline const float& operator[](int index) const
    {
        return Data.m128_f32[index];
    }
    __forceinline __m128 squaredNorm() const
    {
        return _mm_dp_ps(Data, Data, 0x7F);
    }
    __forceinline __m128 norm() const
    {
        auto sqNorm = squaredNorm();
        return _mm_mul_ps(sqNorm, _mm_rsqrt_ps(sqNorm));
    }
    __forceinline __m128 normalized() const
    {
        auto dp = _mm_dp_ps(Data, Data, 0x7F);
        dp = _mm_rsqrt_ps(dp);
        return _mm_mul_ps(Data, dp);
    }
    __forceinline void normalize()
    {
        Data = normalized();
    }
    __m128 Data;
};

Operators as used in the algorithm:

__forceinline FVector4 operator-(const FVector4& a, const FVector4& b)
{
    return FVector4(_mm_sub_ps(a.Data, b.Data));
}
__forceinline FVector4 operator*(const FVector4& a, const FVector4& b)
{
    return FVector4(_mm_mul_ps(a.Data, b.Data));
}
__forceinline FVector4 operator/(const FVector4& a, const FVector4& b)
{
    return FVector4(_mm_mul_ps(a.Data, _mm_rcp_ps(b.Data)));
}
__forceinline FVector4& operator+=(FVector4& a, const FVector4& b)
{
    a.Data = _mm_add_ps(a.Data, b.Data);
    return a;
}
__forceinline FVector4& operator-=(FVector4& a, const FVector4& b)
{
    a.Data = _mm_sub_ps(a.Data, b.Data);
    return a;
}