#include "Matrix4f.h"
#include "Vector4f.h"
// MatrixMultiply1 -- a naive C++ matrix-vector multiplication function.
// It's correct, but that's about the only thing impressive about it.
//
// Performance: ~90 cycles/vector
Vector4f MatrixMultiply1(Matrix4f &m, Vector4f &vin)
{
    // result[r] = dot(row r of m, vin). The accumulator is seeded with
    // the first product and summed left-to-right, so the floating-point
    // grouping matches the fully unrolled expression.
    float result[4];
    for (int row = 0; row < 4; ++row) {
        float acc = m.elts[row][0] * vin[0];
        for (int col = 1; col < 4; ++col)
            acc += m.elts[row][col] * vin[col];
        result[row] = acc;
    }
    return Vector4f(result[0], result[1], result[2], result[3]);
}
// MatrixMultiply2 -- a faster version of MatrixMultiply1, still in C++.
//
// Performance: 70 cycles/vector
void MatrixMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
    const float *src = vin->Ref();
    float *dst = vout->Ref();
    // dst[r] = dot(row r of m, src). All four reads of src for a row
    // happen before dst[r] is written, and the sum accumulates
    // left-to-right -- the same ordering as the unrolled original.
    for (int row = 0; row < 4; ++row) {
        float acc = m.elts[row][0] * src[0];
        for (int col = 1; col < 4; ++col)
            acc += m.elts[row][col] * src[col];
        dst[row] = acc;
    }
}
// MatrixMultiply3 -- a C++/ASM version of MatrixMultiply2, which takes
// advantage of Intel's SSE instructions. This version requires that
// M be in column-major order: each 16-byte group at m.Ref() is loaded
// as one matrix column and multiplied by a broadcast vector component.
//
// In:  m    - matrix, read via m.Ref() as four consecutive 16-byte groups
//      vin  - input vector (loaded with movups, so no alignment required)
//      vout - receives m * vin; written exactly once, after all reads of
//             vin, so vout may alias vin
// Registers used inside __asm: esi, edi, edx, xmm0-2, xmm4-7
//
// Performance: 57 cycles/vector
void MatrixMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
// Get a pointer to the elements of m
float *row0 = m.Ref();
__asm {
mov esi, vin
mov edi, vout
// load columns of matrix into xmm4-7
mov edx, row0
movups xmm4, [edx]
movups xmm5, [edx+0x10]
movups xmm6, [edx+0x20]
movups xmm7, [edx+0x30]
// load v into xmm0.
movups xmm0, [esi]
// we'll store the final result in xmm2; initialize it
// to zero
xorps xmm2, xmm2
// broadcast x into xmm1 (shufps imm 0x00 replicates element 0 into
// all four lanes), multiply it by the first column of the matrix
// (xmm4), and add it to the total
movups xmm1, xmm0
shufps xmm1, xmm1, 0x00
mulps xmm1, xmm4
addps xmm2, xmm1
// repeat the process for y (imm 0x55), z (0xAA) and w (0xFF)
movups xmm1, xmm0
shufps xmm1, xmm1, 0x55
mulps xmm1, xmm5
addps xmm2, xmm1
movups xmm1, xmm0
shufps xmm1, xmm1, 0xAA
mulps xmm1, xmm6
addps xmm2, xmm1
movups xmm1, xmm0
shufps xmm1, xmm1, 0xFF
mulps xmm1, xmm7
addps xmm2, xmm1
// write the results to vout
movups [edi], xmm2
}
}
// BatchMultiply1 -- A modification to MatrixMultiply2 in which we
// multiply several input vectors (vin) by the same matrix (m), storing the
// results in 'vout'. A total of 'len' vectors are processed. This
// prevents us from having to re-load 'm' every time through the loop.
// This also allows us to embed the transpose operation into the function
// body, so we can continue to store our matrices in row-major order,
// if we wish.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' input vectors (unaligned loads are used)
//      vout - receives the 'len' transformed vectors
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 32 cycles/vector
void BatchMultiply1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM1_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movups xmm0, [esi]
        add esi, vecSize
        // we'll store the final result in xmm2; initialize it to zero
        xorps xmm2, xmm2
        // broadcast x into xmm1, multiply it by the first column of the
        // matrix (xmm4), and add it to the total
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1
        // repeat the process for y, z and w (columns xmm5-7)
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1
        // write the results to vout, and advance the output pointer
        movups [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM1_START
    }
}
// BatchMultiply2 -- A simple modification to BatchMultiply1: we now use
// aligned moves (movaps) instead of unaligned moves (movups). This is much
// faster, but requires that Matrix4f and Vector4f objects are aligned
// on 16-byte boundaries. We use the __declspec(align(16)) specifier in
// the Matrix4f and Vector4f class definitions to accomplish this.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 28 cycles/vector
void BatchMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM2_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movaps xmm0, [esi]
        add esi, vecSize
        // we'll store the final result in xmm2; initialize it to zero
        xorps xmm2, xmm2
        // broadcast x into xmm1, multiply it by the first column of the
        // matrix (xmm4), and add it to the total
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1
        // repeat the process for y, z and w (columns xmm5-7)
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1
        // write the results to vout, advance the output pointer, and loop
        movaps [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM2_START
    }
}
// BatchMultiply3 -- A modification to BatchMultiply2 which makes better
// use of instruction pairing.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 22 cycles/vector
void BatchMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM3_START:
        // load the next input vector into xmm0, and advance the input
        // and output pointers (the output advance is hoisted up here
        // for pairing; the store below compensates with [edi-0x10])
        movaps xmm0, [esi]
        add edi, vecSize
        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0)
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        movaps xmm3, xmm0
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF
        // multiply xmm0-3 by the appropriate columns of the matrix
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7
        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2
        // write the results to vout (edi already advanced), and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM3_START
    }
}
// BatchMultiply4 -- A modification to BatchMultiply3 which uses
// SSE prefetching instructions to improve performance with large
// input sets.
//
// In:  m    - matrix (transposed into xmm4-7 before the loop)
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 21 cycles/vector
void BatchMultiply4(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 would wrap ecx to 0xFFFFFFFF and iterate ~2^32 times.
    if (len <= 0)
        return;
    // transpose the matrix into xmm4-7; the loop below assumes no
    // intervening code touches those registers.
    m.TransposeIntoXMM();
    static const int vecSize = sizeof(Vector4f);
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vectors remaining (> 0)
BM4_START:
        // load the next input vector into xmm0, and prefetch upcoming
        // vectors into the cache
        movaps xmm0, [esi]
        prefetchnta [esi+0x30]
        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0), hiding the pointer increments and the output
        // prefetch between the moves
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        add edi, vecSize
        movaps xmm3, xmm0
        prefetchnta [edi+0x30]
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF
        // multiply xmm0-3 by the appropriate columns of the matrix
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7
        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2
        // write the results to vout (edi already advanced), and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM4_START
    }
}
// BatchMultiply5 -- A modified version of BatchMultiply4 which loads
// vector components individually from memory, thereby allowing us
// to work on TWO VECTORS SIMULTANEOUSLY!
//
// In:  m    - matrix; a transposed copy is made, so m itself can stay
//             row-major
//      vin  - array of 'len' 16-byte-aligned input vectors
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 20 cycles/vector
void BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // Nothing to do; also keeps a negative len out of the odd/even
    // handling below.
    if (len <= 0)
        return;
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f); // loop step: two vectors
    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 (original len of 0 or 1) would wrap ecx to 0xFFFFFFFF
    // and iterate ~2^32 times.
    if (len == 0)
        return;
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vector PAIRS remaining (> 0)
        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]
BM5_START:
        // process x of both vectors (hiding the prefetches in the delays)
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [esi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [edi+0x30]
        mulps xmm3, xmm4
        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process z
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        mulps xmm2, xmm6
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process w (hiding some pointer increments between the
        // multiplies)
        movss xmm0, [esi+0x0C]
        movss xmm2, [esi+0x1C]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm7
        add esi, vecSize
        mulps xmm2, xmm7
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2
        // write output vectors to memory (edi already advanced), and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BM5_START
    }
}
// BatchTransform1 -- A modified version of BatchMultiply4 which makes
// an additional assumption about the vectors in vin: if each vector's
// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is
// the case for 3D vertices), we can eliminate a move, a shuffle and a
// multiply instruction: the w contribution is just the 4th matrix
// column (xmm7) added directly.
//
// In:  m    - matrix; a transposed copy is made, so m itself can stay
//             row-major
//      vin  - array of 'len' 16-byte-aligned vectors, each with w == 1.0
//      vout - receives the 'len' transformed vectors (16-byte aligned)
//      len  - number of vectors; values <= 0 process nothing
//
// Performance: 17 cycles/vector
void BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // Nothing to do; also keeps a negative len out of the odd/even
    // handling below.
    if (len <= 0)
        return;
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f); // loop step: two vectors
    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time
    // The asm loop is do/while-shaped (dec ecx / jnz): entering it with
    // len == 0 (original len of 0 or 1) would wrap ecx to 0xFFFFFFFF
    // and iterate ~2^32 times.
    if (len == 0)
        return;
    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len                  // ecx = vector PAIRS remaining (> 0)
        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]
BT2_START:
        // process x of both vectors (hiding the prefetches in the delays)
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [edi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [esi+0x30]
        mulps xmm3, xmm4
        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process z (hiding some pointer arithmetic between
        // the multiplies)
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        add esi, vecSize
        mulps xmm2, xmm6
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2
        // process w: since w == 1.0, just add the 4th column directly
        addps xmm1, xmm7
        addps xmm3, xmm7
        // write output vectors to memory (edi already advanced) and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BT2_START
    }
}