ChaseRLewis -3 Junior Poster in Training

So I am doing a rundown of different 3D math libraries. I am noticing that functions that return a value are HORRIBLY inefficient compared to functions that return void and write into an output parameter. I decided to do a rundown of how a move constructor/assignment affects this. Having a move constructor gives an average 3.5-4x improvement, but the void method gives a 20x improvement. In fact, compared to the DirectX Math library (the fastest actual library), my void function is about 20% faster than even that, and all it returns is a 4-byte pointer from its equation.

I'm going to modify the equation to make it more efficient, since I've noticed 3D applications don't need to do a complete matrix multiplication to get accurate results, but is there any way around this deficiency? It would seem to me there is no reason that

Pseudocode

 Matrix4x4 m0,m1,m2;
m2 = m1*m0 //<- overhead from move should be smaller than measured
//Shouldn't be different 
void MatMultiply(m0,m1,m2) 

Even if it has to create a temporary because the compiler needs a temp r-value in some situations, it should only need that for chained multiplications such as the one below. This should be repeatable ad nauseam, so no more than 1 r-value is needed for any number of arithmetic operations returning r-values of the same type.

    Matrix4x4 rvalue;
    MatMultiply(m0,m0,rvalue)
    MatMultiply(rvalue,m1,m2) //<- 1 r-value Would Expect to be equivalent to m2 = m0*m0*m1

Actual Class

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <new>
#include <xmmintrin.h>

using namespace std;

// 4x4 float matrix stored as 16 consecutive floats in a 16-byte-aligned heap
// block, so each group of four floats can be loaded with _mm_load_ps.
//
// Every constructor except the move constructor performs one aligned heap
// allocation. The value-returning operators therefore pay for an
// allocate/free pair per temporary, which the static Multiply overload
// (writing into a caller-supplied matrix) does not.
//
// A moved-from matrix holds a null Data pointer; it may only be destroyed or
// assigned to.
class Matrix4x4
{
protected:
    float* Data;    // owned; 16 floats, 16-byte aligned; nullptr only after a move

private:
    enum { ElementCount = 16, Alignment = 16 };

    // Allocates uninitialised, 16-byte-aligned storage for one matrix.
    // Throws std::bad_alloc instead of handing back a null pointer.
    static float* Allocate()
    {
#if defined(_WIN32)
        void* block = _aligned_malloc(ElementCount * sizeof(float), Alignment);
#else
        void* block = nullptr;
        if (posix_memalign(&block, Alignment, ElementCount * sizeof(float)) != 0)
            block = nullptr;
#endif
        if (block == nullptr)
            throw std::bad_alloc();
        return static_cast<float*>(block);
    }

    // Frees storage obtained from Allocate(); a null pointer is a no-op.
    static void Release(float* block) noexcept
    {
#if defined(_WIN32)
        _aligned_free(block);
#else
        std::free(block);
#endif
    }

    // Shared SSE kernel. Treating each group of four floats as one row:
    //     out.row(i) = sum over j of scalars[4*i + j] * rows.row(j)
    // All rows of `rows` are loaded before anything is stored, and each row
    // of `scalars` is fully consumed before the matching row of `out` is
    // written, so `out` may alias either input.
    static void MultiplyInto(const float* scalars, const float* rows, float* out) noexcept
    {
        const __m128 row[4] = {
            _mm_load_ps(rows),
            _mm_load_ps(rows + 4),
            _mm_load_ps(rows + 8),
            _mm_load_ps(rows + 12)
        };
        for (int i = 0; i < ElementCount; i += 4)
        {
            // The j == 0 term seeds the accumulator, so no zero-initialisation
            // is needed and the loop starts at j == 1.
            __m128 acc = _mm_mul_ps(row[0], _mm_set1_ps(scalars[i]));
            for (int j = 1; j < 4; j++)
            {
                acc = _mm_add_ps(_mm_mul_ps(row[j], _mm_set1_ps(scalars[i + j])), acc);
            }
            _mm_store_ps(out + i, acc);
        }
    }

public:
    #pragma region  OPERATORS
    // Move assignment: frees the current block and takes over a's block.
    Matrix4x4& operator=(Matrix4x4&& a) noexcept
    {
        if (this != &a)
        {
            Release(Data);
            Data = a.Data;
            a.Data = nullptr;
        }
        return *this;
    }

    // Copy assignment: copies a's 16 elements, reusing this object's block
    // (allocating one first if this object was moved from).
    // `a` must not be in the moved-from state.
    Matrix4x4& operator=(const Matrix4x4& a)
    {
        if (this != &a)
        {
            if (Data == nullptr)
                Data = Allocate();
            std::memcpy(Data, a.Data, ElementCount * sizeof(float));
        }
        return *this;
    }

    // Returns a new matrix whose every element is this matrix's element times b.
    Matrix4x4 operator*(const float& b) const
    {
        Matrix4x4 r;
        for (int i = 0; i < ElementCount; i++)
        {
            r.Data[i] = Data[i] * b;
        }
        return r;
    }

    // Returns r with r.row(i) = sum_j (*this)[i][j] * a.row(j).
    Matrix4x4 Multiply(const Matrix4x4& a) const
    {
        Matrix4x4 r;
        MultiplyInto(Data, a.Data, r.Data);
        return r;
    }

    // Writes into c without allocating: c.row(i) = sum_j b[i][j] * a.row(j).
    // Note the operand roles: Multiply(a, b, c) stores the same values that
    // b.Multiply(a) returns. c may be the same object as a or b.
    static void Multiply(const Matrix4x4& a,const Matrix4x4& b,Matrix4x4& c)
    {
        MultiplyInto(b.Data, a.Data, c.Data);
    }

    // Operator form of Multiply(a).
    Matrix4x4 operator*(const Matrix4x4& a) const
    {
        return Multiply(a);
    }
#pragma endregion
    // Copy constructor: allocates a new block and copies all 16 elements.
    // `a` must not be in the moved-from state.
    Matrix4x4(const Matrix4x4& a) : Data(Allocate())
    {
        std::memcpy(Data, a.Data, ElementCount * sizeof(float));
    }

    // Move constructor: takes over a's block and leaves a with a null pointer.
    Matrix4x4(Matrix4x4&& a) noexcept : Data(a.Data)
    {
        a.Data = nullptr;
    }

    // Diagonal constructor: elements 0, 5 and 10 are d, element 15 is 1,
    // everything else is zero.
    Matrix4x4(float d) : Data(Allocate())
    {
        std::memset(Data, 0, ElementCount * sizeof(float));
        Data[0] = d;
        Data[5] = d;
        Data[10] = d;
        Data[15] = 1;
    }

    // Empty constructor: allocates storage but leaves the elements uninitialised.
    Matrix4x4() : Data(Allocate())
    {
    }

    ~Matrix4x4()
    {
        Release(Data);
    }

};

How I'm testing. Numbers are really consistent: +/- .003 seconds for a million iterations, which is a ~1% change or less.

    // Benchmark driver: times four ways of multiplying two 4x4 matrices for
    // `iterations` rounds each and prints the laps recorded by the timer.
    int main(void)
    {
        Matrix4x4 m0(1);
        Matrix4x4 m1(2);
        Matrix4x4 m3(1); //My Matrices

        XMMATRIX xm0 = XMMatrixIdentity();
        XMMATRIX xm1 = XMMatrixIdentity(); //DirectX Class
        XMMATRIX xm2 = XMMatrixIdentity();

        XMFLOAT4X4 xmf0;
        XMFLOAT4X4 xmf1; //This is the holding class for DirectX whereas XMMATRIX is a __m128[4] which can't be
                         //accessed directly. To ship to GPU I need to have an array in this form.

        // Give both holding structs defined contents before the timed loop
        // loads from them; they were previously read while uninitialised.
        XMStoreFloat4x4(&xmf0,xm0);
        XMStoreFloat4x4(&xmf1,xm1);

        GameTimer timer;
        timer.Reset();

        // Value-returning operator*: one temporary per iteration, then move-assigned.
        for(int i = 0; i < iterations;i++)
        {
                    m3 = m0*m1;     //Debug Time: 1.67 sec, Release Time: .67 sec
        }
        timer.Update();
        timer.Lap("Mine");

        // Static overload writing straight into m3.
        for(int i = 0;i < iterations;i++)
        {
            Matrix4x4::Multiply(m0,m1,m3);   //Debug Time .23 sec, Release Time: .21 sec
        }
        timer.Update();
        // NOTE(review): this lap times Matrix4x4::Multiply, not glm - the label looks stale; confirm.
        timer.Lap("glm");

        // DirectX path including the load from, and store to, XMFLOAT4X4.
        for(int i = 0;i < iterations;i++)
        {
            xm0 = XMLoadFloat4x4(&xmf0);
            xm1 = XMLoadFloat4x4(&xmf1);
            xm2 = XMMatrixMultiply(xm0,xm1);
            XMStoreFloat4x4(&xmf0,xm2);      //Debug Time .43 sec, Release Time: .43 sec
        }

        timer.Update();
        timer.Lap("DirectX-Store");

        // DirectX path with the operands already held in XMMATRIX form.
        for(int i = 0;i < iterations;i++)
        {
            xm2 = XMMatrixMultiply(xm0,xm1);
            XMStoreFloat4x4(&xmf0,xm2);       //Debug Time .27 sec, Release Time: .26 sec
        }

        timer.Update();
        timer.Lap("DirectX-PreStored");
        PrintTimer(timer);

        return 0;
    }
Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.