SSE patch by kyre, with runtime CPU detection and a cvar r_skeletal_use_sse if SSE...
author divverent <divverent@d7cf8633-e32d-0410-b094-e92efae38249>
Fri, 8 Oct 2010 17:54:16 +0000 (17:54 +0000)
committer divverent <divverent@d7cf8633-e32d-0410-b094-e92efae38249>
Fri, 8 Oct 2010 17:54:16 +0000 (17:54 +0000)
git-svn-id: svn://svn.icculus.org/twilight/trunk/darkplaces@10517 d7cf8633-e32d-0410-b094-e92efae38249

makefile.inc
mod_skeletal_animatevertices_generic.c [new file with mode: 0644]
mod_skeletal_animatevertices_generic.h [new file with mode: 0644]
mod_skeletal_animatevertices_sse.c [new file with mode: 0644]
mod_skeletal_animatevertices_sse.h [new file with mode: 0644]
model_alias.c
model_alias.h
quakedef.h

index 3f41d46..b98e2b2 100644 (file)
@@ -138,6 +138,8 @@ OBJ_COMMON= \
        mdfour.o \
        menu.o \
        meshqueue.o \
+       mod_skeletal_animatevertices_sse.o \
+       mod_skeletal_animatevertices_generic.o \
        model_alias.o \
        model_brush.o \
        model_shared.o \
@@ -188,6 +190,8 @@ CFLAGS_RELEASE=
 CFLAGS_RELEASE_PROFILE=-fbranch-probabilities
 CFLAGS_SDL=$(SDLCONFIG_CFLAGS)
 
+CFLAGS_SSE=-msse
+
 OPTIM_DEBUG=$(CPUOPTIMIZATIONS)
 #OPTIM_RELEASE=-O2 -fno-strict-aliasing -ffast-math -funroll-loops $(CPUOPTIMIZATIONS)
 #OPTIM_RELEASE=-O2 -fno-strict-aliasing -fno-math-errno -fno-trapping-math -ffinite-math-only -fno-signaling-nans -fcx-limited-range -funroll-loops $(CPUOPTIMIZATIONS)
@@ -513,6 +517,10 @@ cd_sdl.o: cd_sdl.c
        $(CHECKLEVEL2)
        $(DO_CC) $(CFLAGS_SDL)
 
+mod_skeletal_animatevertices_sse.o: mod_skeletal_animatevertices_sse.c
+       $(CHECKLEVEL2)
+       $(DO_CC) $(CFLAGS_SSE)
+
 darkplaces.o: %.o : %.rc
        $(CHECKLEVEL2)
        $(WINDRES) -o $@ $<
diff --git a/mod_skeletal_animatevertices_generic.c b/mod_skeletal_animatevertices_generic.c
new file mode 100644 (file)
index 0000000..24cc8a9
--- /dev/null
@@ -0,0 +1,213 @@
+#include "mod_skeletal_animatevertices_generic.h"
+
+typedef struct
+{
+       float f[12];
+}
+float12_t;
+
+void Mod_Skeletal_AnimateVertices_Generic(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
+{
+       // vertex weighted skeletal
+       int i, k;
+       int blends;
+       float12_t *bonepose;
+       float12_t *boneposerelative;
+       float m[12];
+       const blendweights_t * RESTRICT weights;
+
+       if (!model->surfmesh.num_vertices)
+               return;
+
+       //unsigned long long ts = rdtsc();
+       bonepose = (float12_t *) Mod_Skeletal_AnimateVertices_AllocBuffers(sizeof(float12_t) * (model->num_bones*2 + model->surfmesh.num_blends));
+       boneposerelative = bonepose + model->num_bones;
+
+       if (skeleton && !skeleton->relativetransforms)
+               skeleton = NULL;
+
+       // interpolate matrices
+       if (skeleton)
+       {
+               for (i = 0;i < model->num_bones;i++)
+               {
+                       Matrix4x4_ToArray12FloatD3D(&skeleton->relativetransforms[i], m);
+                       if (model->data_bones[i].parent >= 0)
+                               R_ConcatTransforms(bonepose[model->data_bones[i].parent].f, m, bonepose[i].f);
+                       else
+                               memcpy(bonepose[i].f, m, sizeof(m));
+
+                       // create a relative deformation matrix to describe displacement
+                       // from the base mesh, which is used by the actual weighting
+                       R_ConcatTransforms(bonepose[i].f, model->data_baseboneposeinverse + i * 12, boneposerelative[i].f);
+               }
+       }
+       else
+       {
+               float originscale = model->num_posescale;
+               float x,y,z,w,lerp;
+               const short * RESTRICT pose6s;
+
+               for (i = 0;i < model->num_bones;i++)
+               {
+                       memset(m, 0, sizeof(m));
+                       for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++)
+                       {
+                               pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i);
+                               lerp = frameblend[blends].lerp;
+                               x = pose6s[3] * (1.0f / 32767.0f);
+                               y = pose6s[4] * (1.0f / 32767.0f);
+                               z = pose6s[5] * (1.0f / 32767.0f);
+                               w = 1.0f - (x*x+y*y+z*z);
+                               w = w > 0.0f ? -sqrt(w) : 0.0f;
+                               m[ 0] += (1-2*(y*y+z*z)) * lerp;
+                               m[ 1] += (  2*(x*y-z*w)) * lerp;
+                               m[ 2] += (  2*(x*z+y*w)) * lerp;
+                               m[ 3] += (pose6s[0] * originscale) * lerp;
+                               m[ 4] += (  2*(x*y+z*w)) * lerp;
+                               m[ 5] += (1-2*(x*x+z*z)) * lerp;
+                               m[ 6] += (  2*(y*z-x*w)) * lerp;
+                               m[ 7] += (pose6s[1] * originscale) * lerp;
+                               m[ 8] += (  2*(x*z-y*w)) * lerp;
+                               m[ 9] += (  2*(y*z+x*w)) * lerp;
+                               m[10] += (1-2*(x*x+y*y)) * lerp;
+                               m[11] += (pose6s[2] * originscale) * lerp;
+                       }
+                       VectorNormalize(m       );
+                       VectorNormalize(m + 4);
+                       VectorNormalize(m + 8);
+                       if (i == r_skeletal_debugbone.integer)
+                               m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value;
+                       m[3] *= r_skeletal_debugtranslatex.value;
+                       m[7] *= r_skeletal_debugtranslatey.value;
+                       m[11] *= r_skeletal_debugtranslatez.value;
+                       if (model->data_bones[i].parent >= 0)
+                               R_ConcatTransforms(bonepose[model->data_bones[i].parent].f, m, bonepose[i].f);
+                       else
+                               memcpy(bonepose[i].f, m, sizeof(m));
+                       // create a relative deformation matrix to describe displacement
+                       // from the base mesh, which is used by the actual weighting
+                       R_ConcatTransforms(bonepose[i].f, model->data_baseboneposeinverse + i * 12, boneposerelative[i].f);
+               }
+       }
+
+       // generate matrices for all blend combinations
+       weights = model->surfmesh.data_blendweights;
+       for (i = 0;i < model->surfmesh.num_blends;i++, weights++)
+       {
+               float * RESTRICT b = boneposerelative[model->num_bones + i].f;
+               const float * RESTRICT m = boneposerelative[weights->index[0]].f;
+               float f = weights->influence[0] * (1.0f / 255.0f);
+               b[ 0] = f*m[ 0]; b[ 1] = f*m[ 1]; b[ 2] = f*m[ 2]; b[ 3] = f*m[ 3];
+               b[ 4] = f*m[ 4]; b[ 5] = f*m[ 5]; b[ 6] = f*m[ 6]; b[ 7] = f*m[ 7];
+               b[ 8] = f*m[ 8]; b[ 9] = f*m[ 9]; b[10] = f*m[10]; b[11] = f*m[11];
+               for (k = 1;k < 4 && weights->influence[k];k++)
+               {
+                       m = boneposerelative[weights->index[k]].f;
+                       f = weights->influence[k] * (1.0f / 255.0f);
+                       b[ 0] += f*m[ 0]; b[ 1] += f*m[ 1]; b[ 2] += f*m[ 2]; b[ 3] += f*m[ 3];
+                       b[ 4] += f*m[ 4]; b[ 5] += f*m[ 5]; b[ 6] += f*m[ 6]; b[ 7] += f*m[ 7];
+                       b[ 8] += f*m[ 8]; b[ 9] += f*m[ 9]; b[10] += f*m[10]; b[11] += f*m[11];
+               }
+       }
+
+#define LOAD_MATRIX_SCALAR() const float * RESTRICT m = boneposerelative[*b].f
+
+#define LOAD_MATRIX3() \
+       LOAD_MATRIX_SCALAR()
+#define LOAD_MATRIX4() \
+       LOAD_MATRIX_SCALAR()
+
+#define TRANSFORM_POSITION_SCALAR(in, out) \
+       (out)[0] = ((in)[0] * m[0] + (in)[1] * m[1] + (in)[2] * m[ 2] + m[3]); \
+       (out)[1] = ((in)[0] * m[4] + (in)[1] * m[5] + (in)[2] * m[ 6] + m[7]); \
+       (out)[2] = ((in)[0] * m[8] + (in)[1] * m[9] + (in)[2] * m[10] + m[11]);
+#define TRANSFORM_VECTOR_SCALAR(in, out) \
+       (out)[0] = ((in)[0] * m[0] + (in)[1] * m[1] + (in)[2] * m[ 2]); \
+       (out)[1] = ((in)[0] * m[4] + (in)[1] * m[5] + (in)[2] * m[ 6]); \
+       (out)[2] = ((in)[0] * m[8] + (in)[1] * m[9] + (in)[2] * m[10]);
+
+#define TRANSFORM_POSITION(in, out) \
+       TRANSFORM_POSITION_SCALAR(in, out)
+#define TRANSFORM_VECTOR(in, out) \
+       TRANSFORM_VECTOR_SCALAR(in, out)
+
+       // transform vertex attributes by blended matrices
+       if (vertex3f)
+       {
+               const float * RESTRICT v = model->surfmesh.data_vertex3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               // special case common combinations of attributes to avoid repeated loading of matrices
+               if (normal3f)
+               {
+                       const float * RESTRICT n = model->surfmesh.data_normal3f;
+                       if (svector3f && tvector3f)
+                       {
+                               const float * RESTRICT sv = model->surfmesh.data_svector3f;
+                               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
+
+                               // Unlike the SSE path, the scalar transforms write exactly three floats per attribute,
+                               // so every vertex (including the last one) is handled inside this loop
+                               for (i = 0; i < model->surfmesh.num_vertices; i++, v += 3, n += 3, sv += 3, tv += 3, b++,
+                                               vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3)
+                               {
+                                       LOAD_MATRIX4();
+                                       TRANSFORM_POSITION(v, vertex3f);
+                                       TRANSFORM_VECTOR(n, normal3f);
+                                       TRANSFORM_VECTOR(sv, svector3f);
+                                       TRANSFORM_VECTOR(tv, tvector3f);
+                               }
+
+                               return;
+                       }
+
+                       for (i = 0;i < model->surfmesh.num_vertices; i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3)
+                       {
+                               LOAD_MATRIX4();
+                               TRANSFORM_POSITION(v, vertex3f);
+                               TRANSFORM_VECTOR(n, normal3f);
+                       }
+               }
+               else
+               {
+                       for (i = 0;i < model->surfmesh.num_vertices; i++, v += 3, b++, vertex3f += 3)
+                       {
+                               LOAD_MATRIX4();
+                               TRANSFORM_POSITION(v, vertex3f);
+                       }
+               }
+       }
+
+       else if (normal3f)
+       {
+               const float * RESTRICT n = model->surfmesh.data_normal3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < model->surfmesh.num_vertices; i++, n += 3, b++, normal3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(n, normal3f);
+               }
+       }
+
+       if (svector3f)
+       {
+               const float * RESTRICT sv = model->surfmesh.data_svector3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < model->surfmesh.num_vertices; i++, sv += 3, b++, svector3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(sv, svector3f);
+               }
+       }
+
+       if (tvector3f)
+       {
+               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < model->surfmesh.num_vertices; i++, tv += 3, b++, tvector3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(tv, tvector3f);
+               }
+       }
+}
diff --git a/mod_skeletal_animatevertices_generic.h b/mod_skeletal_animatevertices_generic.h
new file mode 100644 (file)
index 0000000..2ad97eb
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef MOD_SKELETAL_ANIMATEVERTICES_GENERIC_H
+#define MOD_SKELETAL_ANIMATEVERTICES_GENERIC_H
+
+#include "quakedef.h"
+// scalar (non-SSE) skeletal vertex animation path; output arrays passed as NULL are skipped
+void Mod_Skeletal_AnimateVertices_Generic(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f);
+
+#endif
diff --git a/mod_skeletal_animatevertices_sse.c b/mod_skeletal_animatevertices_sse.c
new file mode 100644 (file)
index 0000000..d6f71f1
--- /dev/null
@@ -0,0 +1,329 @@
+#include "mod_skeletal_animatevertices_sse.h"
+
+#ifdef SSE_POSSIBLE
+
+#ifdef MATRIX4x4_OPENGLORIENTATION
+#error "SSE skeletal requires D3D matrix layout"
+#endif
+
+#include <xmmintrin.h>
+
+void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
+{
+       // vertex weighted skeletal
+       int i, k;
+       int blends;
+       matrix4x4_t *bonepose;
+       matrix4x4_t *boneposerelative;
+       float m[12];
+       matrix4x4_t mm, mm2;
+       const blendweights_t * RESTRICT weights;
+       int num_vertices_minus_one;
+
+       if (!model->surfmesh.num_vertices)
+               return;
+
+       num_vertices_minus_one = model->surfmesh.num_vertices - 1;
+
+       //unsigned long long ts = rdtsc();
+       bonepose = (matrix4x4_t *) Mod_Skeletal_AnimateVertices_AllocBuffers(sizeof(matrix4x4_t) * (model->num_bones*2 + model->surfmesh.num_blends));
+       boneposerelative = bonepose + model->num_bones;
+
+       if (skeleton && !skeleton->relativetransforms)
+               skeleton = NULL;
+
+       // interpolate matrices
+       if (skeleton)
+       {
+               for (i = 0;i < model->num_bones;i++)
+               {
+                       // relativetransforms is in GL column-major order, which is what we need for SSE
+                       // transposed style processing
+                       if (model->data_bones[i].parent >= 0)
+                               Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &skeleton->relativetransforms[i]);
+                       else
+                               memcpy(&bonepose[i], &skeleton->relativetransforms[i], sizeof(matrix4x4_t));
+
+                       // create a relative deformation matrix to describe displacement
+                       // from the base mesh, which is used by the actual weighting
+                       Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major
+                       Matrix4x4_Concat(&boneposerelative[i], &bonepose[i], &mm);
+               }
+       }
+       else
+       {
+               float originscale = model->num_posescale;
+               float x,y,z,w,lerp;
+               const short * RESTRICT pose6s;
+
+               for (i = 0;i < model->num_bones;i++)
+               {
+                       memset(m, 0, sizeof(m));
+                       for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++)
+                       {
+                               pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i);
+                               lerp = frameblend[blends].lerp;
+                               x = pose6s[3] * (1.0f / 32767.0f);
+                               y = pose6s[4] * (1.0f / 32767.0f);
+                               z = pose6s[5] * (1.0f / 32767.0f);
+                               w = 1.0f - (x*x+y*y+z*z);
+                               w = w > 0.0f ? -sqrt(w) : 0.0f;
+                               m[ 0] += (1-2*(y*y+z*z)) * lerp;
+                               m[ 1] += (  2*(x*y-z*w)) * lerp;
+                               m[ 2] += (  2*(x*z+y*w)) * lerp;
+                               m[ 3] += (pose6s[0] * originscale) * lerp;
+                               m[ 4] += (  2*(x*y+z*w)) * lerp;
+                               m[ 5] += (1-2*(x*x+z*z)) * lerp;
+                               m[ 6] += (  2*(y*z-x*w)) * lerp;
+                               m[ 7] += (pose6s[1] * originscale) * lerp;
+                               m[ 8] += (  2*(x*z-y*w)) * lerp;
+                               m[ 9] += (  2*(y*z+x*w)) * lerp;
+                               m[10] += (1-2*(x*x+y*y)) * lerp;
+                               m[11] += (pose6s[2] * originscale) * lerp;
+                       }
+                       VectorNormalize(m       );
+                       VectorNormalize(m + 4);
+                       VectorNormalize(m + 8);
+                       if (i == r_skeletal_debugbone.integer)
+                               m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value;
+                       m[3] *= r_skeletal_debugtranslatex.value;
+                       m[7] *= r_skeletal_debugtranslatey.value;
+                       m[11] *= r_skeletal_debugtranslatez.value;
+                       Matrix4x4_FromArray12FloatD3D(&mm, m);
+                       if (model->data_bones[i].parent >= 0)
+                               Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &mm);
+                       else
+                               memcpy(&bonepose[i], &mm, sizeof(mm));
+                       // create a relative deformation matrix to describe displacement
+                       // from the base mesh, which is used by the actual weighting
+                       Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major
+                       Matrix4x4_Concat(&mm2, &bonepose[i], &mm);
+                       Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose
+               }
+       }
+
+       // generate matrices for all blend combinations
+       weights = model->surfmesh.data_blendweights;
+       for (i = 0;i < model->surfmesh.num_blends;i++, weights++)
+       {
+               float * RESTRICT b = &boneposerelative[model->num_bones + i].m[0][0];
+               const float * RESTRICT m = &boneposerelative[weights->index[0]].m[0][0];
+               float f = weights->influence[0] * (1.0f / 255.0f);
+               __m128 fv = _mm_set_ps1(f);
+               __m128 b0 = _mm_load_ps(m);
+               __m128 b1 = _mm_load_ps(m+4);
+               __m128 b2 = _mm_load_ps(m+8);
+               __m128 b3 = _mm_load_ps(m+12);
+               __m128 m0, m1, m2, m3;
+               b0 = _mm_mul_ps(b0, fv);
+               b1 = _mm_mul_ps(b1, fv);
+               b2 = _mm_mul_ps(b2, fv);
+               b3 = _mm_mul_ps(b3, fv);
+               for (k = 1;k < 4 && weights->influence[k];k++)
+               {
+                       m = &boneposerelative[weights->index[k]].m[0][0];
+                       f = weights->influence[k] * (1.0f / 255.0f);
+                       fv = _mm_set_ps1(f);
+                       m0 = _mm_load_ps(m);
+                       m1 = _mm_load_ps(m+4);
+                       m2 = _mm_load_ps(m+8);
+                       m3 = _mm_load_ps(m+12);
+                       m0 = _mm_mul_ps(m0, fv);
+                       m1 = _mm_mul_ps(m1, fv);
+                       m2 = _mm_mul_ps(m2, fv);
+                       m3 = _mm_mul_ps(m3, fv);
+                       b0 = _mm_add_ps(m0, b0);
+                       b1 = _mm_add_ps(m1, b1);
+                       b2 = _mm_add_ps(m2, b2);
+                       b3 = _mm_add_ps(m3, b3);
+               }
+               _mm_store_ps(b, b0);
+               _mm_store_ps(b+4, b1);
+               _mm_store_ps(b+8, b2);
+               _mm_store_ps(b+12, b3);
+       }
+
+#define LOAD_MATRIX_SCALAR() const float * RESTRICT m = &boneposerelative[*b].m[0][0]
+
+#define LOAD_MATRIX3() \
+       const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \
+       /* bonepose array is 16 byte aligned */ \
+       __m128 m1 = _mm_load_ps((m)); \
+       __m128 m2 = _mm_load_ps((m)+4); \
+       __m128 m3 = _mm_load_ps((m)+8);
+#define LOAD_MATRIX4() \
+       const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \
+       /* bonepose array is 16 byte aligned */ \
+       __m128 m1 = _mm_load_ps((m)); \
+       __m128 m2 = _mm_load_ps((m)+4); \
+       __m128 m3 = _mm_load_ps((m)+8); \
+       __m128 m4 = _mm_load_ps((m)+12)
+
+       /* Note that matrix is 4x4 and transposed compared to non-USE_SSE codepath */
+#define TRANSFORM_POSITION_SCALAR(in, out) \
+       (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8] + m[12]); \
+       (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9] + m[13]); \
+       (out)[2] = ((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10] + m[14]);
+#define TRANSFORM_VECTOR_SCALAR(in, out) \
+       (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8]); \
+       (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9]); \
+       (out)[2] = ((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10]);
+
+#define TRANSFORM_POSITION(in, out) { \
+               __m128 pin = _mm_loadu_ps(in); /* we ignore the value in the last element (x from the next vertex) */ \
+               __m128 x = _mm_shuffle_ps(pin, pin, 0x0); \
+               __m128 t1 = _mm_mul_ps(x, m1); \
+               \
+               /* y, + x */ \
+               __m128 y = _mm_shuffle_ps(pin, pin, 0x55); \
+               __m128 t2 = _mm_mul_ps(y, m2); \
+               __m128 t3 = _mm_add_ps(t1, t2); \
+               \
+               /* z, + (y+x) */ \
+               __m128 z = _mm_shuffle_ps(pin, pin, 0xaa); \
+               __m128 t4 = _mm_mul_ps(z, m3); \
+               __m128 t5 = _mm_add_ps(t3, t4); \
+               \
+               /* + m3 */ \
+               __m128 pout = _mm_add_ps(t5, m4); \
+               _mm_storeu_ps((out), pout); \
+       }
+
+#define TRANSFORM_VECTOR(in, out) { \
+               __m128 vin = _mm_loadu_ps(in); \
+               \
+               /* x */ \
+               __m128 x = _mm_shuffle_ps(vin, vin, 0x0); \
+               __m128 t1 = _mm_mul_ps(x, m1); \
+               \
+               /* y, + x */ \
+               __m128 y = _mm_shuffle_ps(vin, vin, 0x55); \
+               __m128 t2 = _mm_mul_ps(y, m2); \
+               __m128 t3 = _mm_add_ps(t1, t2); \
+               \
+               /* nz, + (ny + nx) */ \
+               __m128 z = _mm_shuffle_ps(vin, vin, 0xaa); \
+               __m128 t4 = _mm_mul_ps(z, m3); \
+               __m128 vout = _mm_add_ps(t3, t4); \
+               _mm_storeu_ps((out), vout); \
+       }
+
+       // transform vertex attributes by blended matrices
+       if (vertex3f)
+       {
+               const float * RESTRICT v = model->surfmesh.data_vertex3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               // special case common combinations of attributes to avoid repeated loading of matrices
+               if (normal3f)
+               {
+                       const float * RESTRICT n = model->surfmesh.data_normal3f;
+                       if (svector3f && tvector3f)
+                       {
+                               const float * RESTRICT sv = model->surfmesh.data_svector3f;
+                               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
+
+                               // Note that for SSE each iteration stores one element past end, so we break one vertex short
+                               // and handle that with scalars in that case
+                               for (i = 0; i < num_vertices_minus_one; i++, v += 3, n += 3, sv += 3, tv += 3, b++,
+                                               vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3)
+                               {
+                                       LOAD_MATRIX4();
+                                       TRANSFORM_POSITION(v, vertex3f);
+                                       TRANSFORM_VECTOR(n, normal3f);
+                                       TRANSFORM_VECTOR(sv, svector3f);
+                                       TRANSFORM_VECTOR(tv, tvector3f);
+                               }
+
+                               // Last vertex needs to be done with scalars to avoid reading/writing 1 word past end of arrays
+                               {
+                                       LOAD_MATRIX_SCALAR();
+                                       TRANSFORM_POSITION_SCALAR(v, vertex3f);
+                                       TRANSFORM_VECTOR_SCALAR(n, normal3f);
+                                       TRANSFORM_VECTOR_SCALAR(sv, svector3f);
+                                       TRANSFORM_VECTOR_SCALAR(tv, tvector3f);
+                               }
+                               //printf("elapsed ticks: %llu\n", rdtsc() - ts); // XXX
+                               return;
+                       }
+
+                       for (i = 0;i < num_vertices_minus_one; i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3)
+                       {
+                               LOAD_MATRIX4();
+                               TRANSFORM_POSITION(v, vertex3f);
+                               TRANSFORM_VECTOR(n, normal3f);
+                       }
+                       {
+                               LOAD_MATRIX_SCALAR();
+                               TRANSFORM_POSITION_SCALAR(v, vertex3f);
+                               TRANSFORM_VECTOR_SCALAR(n, normal3f);
+                       }
+               }
+               else
+               {
+                       for (i = 0;i < num_vertices_minus_one; i++, v += 3, b++, vertex3f += 3)
+                       {
+                               LOAD_MATRIX4();
+                               TRANSFORM_POSITION(v, vertex3f);
+                       }
+                       {
+                               LOAD_MATRIX_SCALAR();
+                               TRANSFORM_POSITION_SCALAR(v, vertex3f);
+                       }
+               }
+       }
+
+       else if (normal3f)
+       {
+               const float * RESTRICT n = model->surfmesh.data_normal3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < num_vertices_minus_one; i++, n += 3, b++, normal3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(n, normal3f);
+               }
+               {
+                       LOAD_MATRIX_SCALAR();
+                       TRANSFORM_VECTOR_SCALAR(n, normal3f);
+               }
+       }
+
+       if (svector3f)
+       {
+               const float * RESTRICT sv = model->surfmesh.data_svector3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < num_vertices_minus_one; i++, sv += 3, b++, svector3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(sv, svector3f);
+               }
+               {
+                       LOAD_MATRIX_SCALAR();
+                       TRANSFORM_VECTOR_SCALAR(sv, svector3f);
+               }
+       }
+
+       if (tvector3f)
+       {
+               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
+               const unsigned short * RESTRICT b = model->surfmesh.blends;
+               for (i = 0; i < num_vertices_minus_one; i++, tv += 3, b++, tvector3f += 3)
+               {
+                       LOAD_MATRIX3();
+                       TRANSFORM_VECTOR(tv, tvector3f);
+               }
+               {
+                       LOAD_MATRIX_SCALAR();
+                       TRANSFORM_VECTOR_SCALAR(tv, tvector3f);
+               }
+       }
+
+#undef LOAD_MATRIX3
+#undef LOAD_MATRIX4
+#undef TRANSFORM_POSITION
+#undef TRANSFORM_VECTOR
+#undef LOAD_MATRIX_SCALAR
+#undef TRANSFORM_POSITION_SCALAR
+#undef TRANSFORM_VECTOR_SCALAR
+}
+
+#endif
diff --git a/mod_skeletal_animatevertices_sse.h b/mod_skeletal_animatevertices_sse.h
new file mode 100644 (file)
index 0000000..7de55ca
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef MOD_SKELETAL_ANIMATEVERTICES_SSE_H
+#define MOD_SKELETAL_ANIMATEVERTICES_SSE_H
+
+#include "quakedef.h"
+// SSE skeletal vertex animation path; only declared when the build defines SSE_POSSIBLE
+#ifdef SSE_POSSIBLE
+void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f);
+#endif
+
+#endif
index b731214..2acd310 100644 (file)
@@ -21,7 +21,15 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 #include "quakedef.h"
 #include "image.h"
 #include "r_shadow.h"
+#include "mod_skeletal_animatevertices_generic.h"
+#ifdef SSE_POSSIBLE
+#include "mod_skeletal_animatevertices_sse.h"
+#endif
 
+#ifdef SSE_POSSIBLE
+static qboolean r_skeletal_use_sse_defined = false;
+cvar_t r_skeletal_use_sse = {0, "r_skeletal_use_sse", "1", "use SSE for skeletal model animation"};
+#endif
 cvar_t r_skeletal_debugbone = {0, "r_skeletal_debugbone", "-1", "development cvar for testing skeletal model code"};
 cvar_t r_skeletal_debugbonecomponent = {0, "r_skeletal_debugbonecomponent", "3", "development cvar for testing skeletal model code"};
 cvar_t r_skeletal_debugbonevalue = {0, "r_skeletal_debugbonevalue", "100", "development cvar for testing skeletal model code"};
@@ -32,6 +40,88 @@ cvar_t mod_alias_supporttagscale = {0, "mod_alias_supporttagscale", "1", "suppor
 
 float mod_md3_sin[320];
 
+static size_t Mod_Skeltal_AnimateVertices_maxbonepose = 0;
+static void *Mod_Skeltal_AnimateVertices_bonepose = NULL;
+void Mod_Skeletal_FreeBuffers(void)
+{
+       if(Mod_Skeltal_AnimateVertices_bonepose)
+               Mem_Free(Mod_Skeltal_AnimateVertices_bonepose);
+       Mod_Skeltal_AnimateVertices_maxbonepose = 0;
+       Mod_Skeltal_AnimateVertices_bonepose = NULL;
+}
+void *Mod_Skeletal_AnimateVertices_AllocBuffers(size_t nbytes)
+{
+       if(Mod_Skeltal_AnimateVertices_maxbonepose < nbytes)
+       {
+               Mod_Skeletal_FreeBuffers(); // NULL-guarded release of the old buffer; the pointer is still NULL on the first call
+               Mod_Skeltal_AnimateVertices_bonepose = Z_Malloc(nbytes);
+               Mod_Skeltal_AnimateVertices_maxbonepose = nbytes;
+       }
+       return Mod_Skeltal_AnimateVertices_bonepose; // shared scratch buffer of at least nbytes bytes; old contents are not carried over when it grows
+}
+
+void Mod_Skeletal_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
+{
+#ifdef SSE_POSSIBLE
+       if(r_skeletal_use_sse_defined)
+               if(r_skeletal_use_sse.integer)
+               {
+                       Mod_Skeletal_AnimateVertices_SSE(model, frameblend, skeleton, vertex3f, normal3f, svector3f, tvector3f);
+                       return;
+               }
+#endif
+       Mod_Skeletal_AnimateVertices_Generic(model, frameblend, skeleton, vertex3f, normal3f, svector3f, tvector3f);
+}
+
+#ifdef SSE_POSSIBLE
+#ifndef SSE_PRESENT
+// code from SDL, shortened as we can expect CPUID to work
+static int CPUID_Features(void)
+{
+       int features = 0;
+# if defined(__GNUC__) && defined(__i386__)
+        __asm__ (
+"        movl    %%ebx,%%edi\n"
+"        xorl    %%eax,%%eax                                           \n"
+"        incl    %%eax                                                 \n"
+"        cpuid                       # Get family/model/stepping/features\n"
+"        movl    %%edx,%0                                              \n"
+"        movl    %%edi,%%ebx\n"
+        : "=m" (features)
+        :
+        : "%eax", "%ecx", "%edx", "%edi"
+        );
+# elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+        __asm {
+        xor     eax, eax
+        inc     eax
+        cpuid                       ; Get family/model/stepping/features
+        mov     features, edx
+        }
+# else
+#  error SSE_POSSIBLE set but no CPUID implementation
+# endif
+       return features;
+}
+#endif
+static qboolean Have_SSE(void)
+{
+       // true when the SSE path may be used: not vetoed by -nosse, and either compiled in, forced, or reported by the CPU
+       // COMMANDLINEOPTION: SSE: -nosse disables SSE support and detection
+       if(COM_CheckParm("-nosse"))
+               return false;
+       // COMMANDLINEOPTION: SSE: -forcesse enables SSE support and disables detection
+#ifdef SSE_PRESENT
+       return true;
+#else
+       // CPUID_Features() returns EDX of CPUID function 1; bit 25 is the SSE feature flag
+       if(COM_CheckParm("-forcesse") || (CPUID_Features() & (1 << 25)))
+               return true;
+       return false;
+#endif
+}
+#endif
+
 void Mod_AliasInit (void)
 {
        int i;
@@ -44,6 +134,20 @@ void Mod_AliasInit (void)
        Cvar_RegisterVariable(&mod_alias_supporttagscale);
        for (i = 0;i < 320;i++)
                mod_md3_sin[i] = sin(i * M_PI * 2.0f / 256.0);
+#ifdef SSE_POSSIBLE
+       {
+               if(Have_SSE())
+               {
+                       Con_Printf("Skeletal animation uses SSE code path\n");
+                       r_skeletal_use_sse_defined = true;
+                       Cvar_RegisterVariable(&r_skeletal_use_sse);
+               }
+               else
+                       Con_Printf("Skeletal animation uses generic code path (SSE disabled or not detected)\n");
+       }
+#else
+       Con_Printf("Skeletal animation uses generic code path (SSE not compiled in)\n");
+#endif
 }
 
 int Mod_Skeletal_AddBlend(dp_model_t *model, const blendweights_t *newweights)
@@ -106,216 +210,6 @@ int Mod_Skeletal_CompressBlend(dp_model_t *model, const int *newindex, const flo
        return Mod_Skeletal_AddBlend(model, &newweights);
 }
 
-static int maxbonepose = 0;
-static float (*bonepose)[12] = NULL;
-
-void Mod_Skeletal_FreeBuffers(void)
-{
-       if(bonepose)
-               Mem_Free(bonepose);
-       maxbonepose = 0;
-       bonepose = NULL;
-}
-
-void Mod_Skeletal_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
-{
-       // vertex weighted skeletal
-       int i, k;
-       int blends;
-       float m[12];
-       float (*boneposerelative)[12];
-       const blendweights_t * RESTRICT weights;
-
-       if (maxbonepose < model->num_bones*2 + model->surfmesh.num_blends)
-       {
-               if (bonepose)
-                       Z_Free(bonepose);
-               maxbonepose = model->num_bones*2 + model->surfmesh.num_blends;
-               bonepose = (float (*)[12])Z_Malloc(maxbonepose * sizeof(float[12]));
-       }
-
-       boneposerelative = bonepose + model->num_bones;
-
-       if (skeleton && !skeleton->relativetransforms)
-               skeleton = NULL;
-
-       // interpolate matrices
-       if (skeleton)
-       {
-               for (i = 0;i < model->num_bones;i++)
-               {
-                       Matrix4x4_ToArray12FloatD3D(&skeleton->relativetransforms[i], m);
-                       if (model->data_bones[i].parent >= 0)
-                               R_ConcatTransforms(bonepose[model->data_bones[i].parent], m, bonepose[i]);
-                       else
-                               memcpy(bonepose[i], m, sizeof(m));
-
-                       // create a relative deformation matrix to describe displacement
-                       // from the base mesh, which is used by the actual weighting
-                       R_ConcatTransforms(bonepose[i], model->data_baseboneposeinverse + i * 12, boneposerelative[i]);
-               }
-       }
-       else
-       {
-               float originscale = model->num_posescale;
-               float x,y,z,w,lerp;
-               const short * RESTRICT pose6s;
-               for (i = 0;i < model->num_bones;i++)
-               {
-                       memset(m, 0, sizeof(m));
-                       for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++)
-                       {
-                               pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i);
-                               lerp = frameblend[blends].lerp;
-                               x = pose6s[3] * (1.0f / 32767.0f);
-                               y = pose6s[4] * (1.0f / 32767.0f);
-                               z = pose6s[5] * (1.0f / 32767.0f);
-                               w = 1.0f - (x*x+y*y+z*z);
-                               w = w > 0.0f ? -sqrt(w) : 0.0f;
-                               m[ 0] += (1-2*(y*y+z*z)) * lerp;
-                               m[ 1] += (  2*(x*y-z*w)) * lerp;
-                               m[ 2] += (  2*(x*z+y*w)) * lerp;
-                               m[ 3] += (pose6s[0] * originscale) * lerp;
-                               m[ 4] += (  2*(x*y+z*w)) * lerp;
-                               m[ 5] += (1-2*(x*x+z*z)) * lerp;
-                               m[ 6] += (  2*(y*z-x*w)) * lerp;
-                               m[ 7] += (pose6s[1] * originscale) * lerp;
-                               m[ 8] += (  2*(x*z-y*w)) * lerp;
-                               m[ 9] += (  2*(y*z+x*w)) * lerp;
-                               m[10] += (1-2*(x*x+y*y)) * lerp;
-                               m[11] += (pose6s[2] * originscale) * lerp;
-                       }
-                       VectorNormalize(m       );
-                       VectorNormalize(m + 4);
-                       VectorNormalize(m + 8);
-                       if (i == r_skeletal_debugbone.integer)
-                               m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value;
-                       m[3] *= r_skeletal_debugtranslatex.value;
-                       m[7] *= r_skeletal_debugtranslatey.value;
-                       m[11] *= r_skeletal_debugtranslatez.value;
-                       if (model->data_bones[i].parent >= 0)
-                               R_ConcatTransforms(bonepose[model->data_bones[i].parent], m, bonepose[i]);
-                       else
-                               memcpy(bonepose[i], m, sizeof(m));
-                       // create a relative deformation matrix to describe displacement
-                       // from the base mesh, which is used by the actual weighting
-                       R_ConcatTransforms(bonepose[i], model->data_baseboneposeinverse + i * 12, boneposerelative[i]);
-               }
-       }
-       
-       // generate matrices for all blend combinations
-       weights = model->surfmesh.data_blendweights;
-       for (i = 0;i < model->surfmesh.num_blends;i++, weights++)
-       {
-               float * RESTRICT b = boneposerelative[model->num_bones + i];
-               const float * RESTRICT m = boneposerelative[weights->index[0]];
-               float f = weights->influence[0] * (1.0f / 255.0f);
-               b[ 0] = f*m[ 0]; b[ 1] = f*m[ 1]; b[ 2] = f*m[ 2]; b[ 3] = f*m[ 3];
-               b[ 4] = f*m[ 4]; b[ 5] = f*m[ 5]; b[ 6] = f*m[ 6]; b[ 7] = f*m[ 7];
-               b[ 8] = f*m[ 8]; b[ 9] = f*m[ 9]; b[10] = f*m[10]; b[11] = f*m[11];
-               for (k = 1;k < 4 && weights->influence[k];k++)
-               {
-                       m = boneposerelative[weights->index[k]];
-                       f = weights->influence[k] * (1.0f / 255.0f);
-                       b[ 0] += f*m[ 0]; b[ 1] += f*m[ 1]; b[ 2] += f*m[ 2]; b[ 3] += f*m[ 3];
-                       b[ 4] += f*m[ 4]; b[ 5] += f*m[ 5]; b[ 6] += f*m[ 6]; b[ 7] += f*m[ 7];
-                       b[ 8] += f*m[ 8]; b[ 9] += f*m[ 9]; b[10] += f*m[10]; b[11] += f*m[11];
-               }
-       }
-
-       // transform vertex attributes by blended matrices
-       if (vertex3f)
-       {
-               const float * RESTRICT v = model->surfmesh.data_vertex3f;
-               const unsigned short * RESTRICT b = model->surfmesh.blends;
-               // special case common combinations of attributes to avoid repeated loading of matrices
-               if (normal3f)
-               {
-                       const float * RESTRICT n = model->surfmesh.data_normal3f;
-                       if (svector3f && tvector3f)
-                       {
-                               const float * RESTRICT sv = model->surfmesh.data_svector3f;
-                               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
-                               for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, n += 3, sv += 3, tv += 3, b++, vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3)
-                               {
-                                       const float * RESTRICT m = boneposerelative[*b];
-                                       vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]);
-                                       vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]);
-                                       vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]);
-                                       normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]);
-                                       normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]);
-                                       normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]);
-                                       svector3f[0] = (sv[0] * m[0] + sv[1] * m[1] + sv[2] * m[ 2]);
-                                       svector3f[1] = (sv[0] * m[4] + sv[1] * m[5] + sv[2] * m[ 6]);
-                                       svector3f[2] = (sv[0] * m[8] + sv[1] * m[9] + sv[2] * m[10]);
-                                       tvector3f[0] = (tv[0] * m[0] + tv[1] * m[1] + tv[2] * m[ 2]);
-                                       tvector3f[1] = (tv[0] * m[4] + tv[1] * m[5] + tv[2] * m[ 6]);
-                                       tvector3f[2] = (tv[0] * m[8] + tv[1] * m[9] + tv[2] * m[10]);
-                               }
-                               return;
-                       }
-                       for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3)
-                       {
-                               const float * RESTRICT m = boneposerelative[*b];
-                               vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]);
-                               vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]);
-                               vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]);
-                               normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]);
-                               normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]);
-                               normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]);
-                       }
-               }
-               else
-               {
-                       for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, b++, vertex3f += 3)
-                       {
-                               const float * RESTRICT m = boneposerelative[*b];
-                               vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]);
-                               vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]);
-                               vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]);
-                       }
-               }
-       }
-       else if (normal3f)
-       {
-               const float * RESTRICT n = model->surfmesh.data_normal3f;
-               const unsigned short * RESTRICT b = model->surfmesh.blends;
-               for (i = 0;i < model->surfmesh.num_vertices;i++, n += 3, b++, normal3f += 3)
-               {
-                       const float * RESTRICT m = boneposerelative[*b];
-                       normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]);
-                       normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]);
-                       normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]);
-               }
-       }
-
-       if (svector3f)
-       {
-               const float * RESTRICT sv = model->surfmesh.data_svector3f;
-               const unsigned short * RESTRICT b = model->surfmesh.blends;
-               for (i = 0;i < model->surfmesh.num_vertices;i++, sv += 3, b++, svector3f += 3)
-               {
-                       const float * RESTRICT m = boneposerelative[*b];
-                       svector3f[0] = (sv[0] * m[0] + sv[1] * m[1] + sv[2] * m[ 2]);
-                       svector3f[1] = (sv[0] * m[4] + sv[1] * m[5] + sv[2] * m[ 6]);
-                       svector3f[2] = (sv[0] * m[8] + sv[1] * m[9] + sv[2] * m[10]);
-               }
-       }
-
-       if (tvector3f)
-       {
-               const float * RESTRICT tv = model->surfmesh.data_tvector3f;
-               const unsigned short * RESTRICT b = model->surfmesh.blends;
-               for (i = 0;i < model->surfmesh.num_vertices;i++, tv += 3, b++, tvector3f += 3)
-               {
-                       const float * RESTRICT m = boneposerelative[*b];
-                       tvector3f[0] = (tv[0] * m[0] + tv[1] * m[1] + tv[2] * m[ 2]);
-                       tvector3f[1] = (tv[0] * m[4] + tv[1] * m[5] + tv[2] * m[ 6]);
-                       tvector3f[2] = (tv[0] * m[8] + tv[1] * m[9] + tv[2] * m[10]);
-               }
-       }
-}
-
 void Mod_MD3_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
 {
        // vertex morph
@@ -404,7 +298,6 @@ void Mod_MD3_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend
                }
        }
 }
-
 void Mod_MDL_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
 {
        // vertex morph
index a756488..ddf8e5c 100644 (file)
@@ -235,5 +235,14 @@ aliasbone_t;
 // for decoding md3 model latlong vertex normals
 extern float mod_md3_sin[320];
 
+extern cvar_t r_skeletal_debugbone;
+extern cvar_t r_skeletal_debugbonecomponent;
+extern cvar_t r_skeletal_debugbonevalue;
+extern cvar_t r_skeletal_debugtranslatex;
+extern cvar_t r_skeletal_debugtranslatey;
+extern cvar_t r_skeletal_debugtranslatez;
+
+void *Mod_Skeletal_AnimateVertices_AllocBuffers(size_t nbytes);
+
 #endif
 
index 9e62f0d..771804b 100644 (file)
@@ -445,15 +445,28 @@ extern cvar_t developer_loading;
 #if defined(__GNUC__)
 # if defined(__i386__)
 #  define DP_ARCH_STR          "686"
+#  define SSE_POSSIBLE
 # elif defined(__x86_64__)
 #  define DP_ARCH_STR          "x86_64"
+#  define SSE_PRESENT
 # elif defined(__powerpc__)
 #  define DP_ARCH_STR          "ppc"
 # endif
 #elif defined(_WIN64)
 # define DP_ARCH_STR           "x86_64"
+# define SSE_PRESENT
 #elif defined(WIN32)
 # define DP_ARCH_STR           "x86"
+# define SSE_POSSIBLE
+#endif
+
+#ifdef SSE_PRESENT
+# define SSE_POSSIBLE /* SSE_PRESENT always implies SSE_POSSIBLE */
+#endif
+
+#ifdef NO_SSE /* when NO_SSE is defined, both SSE macros are removed regardless of the platform checks above */
+# undef SSE_PRESENT
+# undef SSE_POSSIBLE
+#endif
 
 /// incremented every frame, never reset