#include "Matrix4f.h"
#include "Vector4f.h"
// MatrixMultiply1 -- a naive C++ matrix-vector multiplication function.
// It's correct, but that's about the only thing impressive about it.
//
// Performance: ~90 cycles/vector
Vector4f MatrixMultiply1(Matrix4f &m, Vector4f &vin)
{
    // result[r] = dot(row r of m, vin). The accumulator is seeded with
    // the first product and summed left-to-right, so the floating-point
    // grouping matches the fully unrolled expression.
    float result[4];
    for (int row = 0; row < 4; ++row) {
        float acc = m.elts[row][0] * vin[0];
        for (int col = 1; col < 4; ++col)
            acc += m.elts[row][col] * vin[col];
        result[row] = acc;
    }
    return Vector4f(result[0], result[1], result[2], result[3]);
}
// MatrixMultiply2 -- a faster version of MatrixMultiply1, still in C++.
//
// Performance: 70 cycles/vector
void MatrixMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
    const float *src = vin->Ref();
    float *dst = vout->Ref();
    // dst[r] = dot(row r of m, src). All four reads of src for a row
    // happen before dst[r] is written, and the sum accumulates
    // left-to-right -- the same ordering as the unrolled original.
    for (int row = 0; row < 4; ++row) {
        float acc = m.elts[row][0] * src[0];
        for (int col = 1; col < 4; ++col)
            acc += m.elts[row][col] * src[col];
        dst[row] = acc;
    }
}
// MatrixMultiply3 -- a C++/ASM version of MatrixMultiply2, which takes
// advantage of Intel's SSE instructions. This version requires that
// M be in column-major order: each 16-byte group at m.Ref() is loaded
// as one matrix column and multiplied by a broadcast vector component.
//
// In:  m    - matrix, read via m.Ref() as four consecutive 16-byte groups
//      vin  - input vector (loaded with movups, so no alignment required)
//      vout - receives m * vin; written exactly once, after all reads of
//             vin, so vout may alias vin
// Registers used inside __asm: esi, edi, edx, xmm0-2, xmm4-7
//
// Performance: 57 cycles/vector
void MatrixMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
// Get a pointer to the elements of m
float *row0 = m.Ref();
__asm {
mov esi, vin
mov edi, vout
// load columns of matrix into xmm4-7
mov edx, row0
movups xmm4, [edx]
movups xmm5, [edx+0x10]
movups xmm6, [edx+0x20]
movups xmm7, [edx+0x30]
// load v into xmm0.
movups xmm0, [esi]
// we'll store the final result in xmm2; initialize it
// to zero
xorps xmm2, xmm2
// broadcast x into xmm1 (shufps imm 0x00 replicates element 0 into
// all four lanes), multiply it by the first column of the matrix
// (xmm4), and add it to the total
movups xmm1, xmm0
shufps xmm1, xmm1, 0x00
mulps xmm1, xmm4
addps xmm2, xmm1
// repeat the process for y (imm 0x55), z (0xAA) and w (0xFF)
movups xmm1, xmm0
shufps xmm1, xmm1, 0x55
mulps xmm1, xmm5
addps xmm2, xmm1
movups xmm1, xmm0
shufps xmm1, xmm1, 0xAA
mulps xmm1, xmm6
addps xmm2, xmm1
movups xmm1, xmm0
shufps xmm1, xmm1, 0xFF
mulps xmm1, xmm7
addps xmm2, xmm1
// write the results to vout
movups [edi], xmm2
}
}
// BatchMultiply1 -- A modification to MatrixMultiply2 in which we
// multiply several input vectors (vin) by the same matrix (m), storing the
// results in 'vout'. A total of 'len' vectors are processed. This
// prevents us from having to re-load 'm' every time through the loop.
// This also allows us to embed the transpose operation into the function
// body, so we can continue to store our matrices in row-major order,
// if we wish.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' input vectors (unaligned loads are used)
//      vout - receives the 'len' transformed vectors
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 32 cycles/vector
void BatchMultiply1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM1_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movups xmm0, [esi]
        add esi, vecSize
        // we'll store the final result in xmm2; initialize it to zero
        xorps xmm2, xmm2
        // broadcast x into xmm1, multiply it by the first column of the
        // matrix (xmm4), and add it to the total
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1
        // repeat the process for y, z and w (columns xmm5-7)
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1
        // write the results to vout, and advance the output pointer
        movups [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM1_START
    }
}
// BatchMultiply2 -- A simple modification to BatchMultiply1: we now use
// aligned moves (movaps) instead of unaligned moves (movups). This is much
// faster, but requires that Matrix4f and Vector4f objects are aligned
// on 16-byte boundaries. We use the __declspec(align(16)) specifier in
// the Matrix4f and Vector4f class definitions to accomplish this.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 28 cycles/vector
void BatchMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM2_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movaps xmm0, [esi]
        add esi, vecSize
        // we'll store the final result in xmm2; initialize it to zero
        xorps xmm2, xmm2
        // broadcast x into xmm1, multiply it by the first column of the
        // matrix (xmm4), and add it to the total
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1
        // repeat the process for y, z and w (columns xmm5-7)
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1
        // write the results to vout, advance the output pointer, and loop
        movaps [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM2_START
    }
}
// BatchMultiply3 -- A modification to BatchMultiply2 which makes better
// use of instruction pairing.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 22 cycles/vector
void BatchMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM3_START:
        // load the next input vector into xmm0, and advance the input
        // and output pointers (the output advance is hoisted up here
        // for pairing; the store below compensates with [edi-0x10])
        movaps xmm0, [esi]
        add edi, vecSize
        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0)
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        movaps xmm3, xmm0
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF
        // multiply xmm0-3 by the appropriate columns of the matrix
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7
        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2
        // write the results to vout (edi already advanced), and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM3_START
    }
}
// BatchMultiply4 -- A modification to BatchMultiply3 which uses
// SSE prefetching instructions to improve performance with large
// input sets.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 21 cycles/vector
void BatchMultiply4(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM4_START:
        // load the next input vector into xmm0, and prefetch upcoming
        // vectors into the cache
        movaps xmm0, [esi]
        prefetchnta [esi+0x30]
        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0), hiding the pointer increments and the output
        // prefetch between the moves
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        add edi, vecSize
        movaps xmm3, xmm0
        prefetchnta [edi+0x30]
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF
        // multiply xmm0-3 by the appropriate columns of the matrix
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7
        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2
        // write the results to vout (edi already advanced), and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM4_START
    }
}
// BatchMultiply5 -- A modified version of BatchMultiply4 which loads
// vector components individually from memory, thereby allowing us
// to work on TWO VECTORS SIMULTANEOUSLY!
//
// In:  m    - matrix; a transposed copy is made, so m itself can stay
//             row-major
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 20 cycles/vector
void BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // Nothing to do; also keeps a negative len out of the odd/even
    // handling below.
    if (len <= 0)
        return;
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f); // loop step: two vectors
    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 (original len of 0 or 1) would wrap ecx to 0xFFFFFFFF
    // and iterate ~2^32 times.
    if (len == 0)
        return;
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vector PAIRS remaining (> 0)
        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]
BM5_START:
        // process x of both vectors (hiding the prefetches in the delays)
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [esi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [edi+0x30]
        mulps xmm3, xmm4
        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process z
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        mulps xmm2, xmm6
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process w (hiding some pointer increments between the
        // multiplies)
        movss xmm0, [esi+0x0C]
        movss xmm2, [esi+0x1C]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm7
        add esi, vecSize
        mulps xmm2, xmm7
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2
        // write output vectors to memory (edi already advanced), and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BM5_START
    }
}
// BatchTransform1 -- A modified version of BatchMultiply4 which makes
// an additional assumption about the vectors in vin: if each vector's
// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is
// the case for 3D vertices), we can eliminate a move, a shuffle and a
// multiply instruction: the w contribution is just the 4th matrix
// column (xmm7) added directly.
//
// In:  m    - matrix; a transposed copy is made, so m itself can stay
//             row-major
//      vin  - array of 'len' 16-byte-aligned vectors, each with w == 1.0
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 17 cycles/vector
void BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // Nothing to do; also keeps a negative len out of the odd/even
    // handling below.
    if (len <= 0)
        return;
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f); // loop step: two vectors
    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 (original len of 0 or 1) would wrap ecx to 0xFFFFFFFF
    // and iterate ~2^32 times.
    if (len == 0)
        return;
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vector PAIRS remaining (> 0)
        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]
BT2_START:
        // process x of both vectors (hiding the prefetches in the delays)
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [edi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [esi+0x30]
        mulps xmm3, xmm4
        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process z (hiding some pointer arithmetic between
        // the multiplies)
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        add esi, vecSize
        mulps xmm2, xmm6
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process w: since w == 1.0, just add the 4th column directly
        addps xmm1, xmm7
        addps xmm3, xmm7
        // write output vectors to memory (edi already advanced) and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BT2_START
    }
}