2 ===========================================================================
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
26 ===========================================================================
29 #include "../precompiled.h"
32 #include "Simd_Generic.h"
35 #include "Simd_SSE2.h"
36 #include "Simd_SSE3.h"
39 //===============================================================
41 // SSE3 implementation of idSIMDProcessor
43 //===============================================================
45 #if defined(MACOS_X) && defined(__i386__)
52 const char * idSIMD_SSE3::GetName( void ) const {
53 return "MMX & SSE & SSE2 & SSE3";
58 #include <xmmintrin.h>
// Selector-immediate builders.
// SHUFFLEPS packs four 2-bit fields into one byte: x -> bits 7..6, y -> bits 5..4,
// z -> bits 3..2, w -> bits 1..0. SHUFFLEPD packs two 1-bit fields: x -> bit 1, y -> bit 0.
// The R_ variants accept the same fields in reversed order (first argument lands in the lowest bits).
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	SHUFFLEPS( w, z, y, x )
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			SHUFFLEPD( y, x )
67 The first argument of an instruction macro is the destination
68 and the second argument is the source operand. The destination
69 operand can be _xmm0 to _xmm7 only. The source operand can be
70 any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,
71 _ebp, _ebx, _esi, or _edi that contains the effective address.
73 For instance: haddps xmm0, xmm1
74 becomes: haddps( _xmm0, _xmm1 )
75 and: haddps xmm0, [esi]
76 becomes: haddps( _xmm0, _esi )
78 The ADDRESS_ADDC macro can be used when the effective source address
79 is formed by adding a constant to a general purpose register.
80 For instance: haddps xmm0, [esi+48]
81 becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
83 The ADDRESS_ADDR macro can be used when the effective source address
84 is formed by adding two general purpose registers.
85 For instance: haddps xmm0, [esi+eax]
86 becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
88 The ADDRESS_ADDRC macro can be used when the effective source address
89 is formed by adding two general purpose registers and a constant.
90 The constant must be in the range [-128, 127].
91 For instance: haddps xmm0, [esi+eax+48]
92 becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
94 The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
95 by adding a scaled general purpose register to another general purpose register.
96 The scale must be either 1, 2, 4 or 8.
97 For instance: haddps xmm0, [esi+eax*4]
98 becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
100 The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
101 by adding a scaled general purpose register to another general purpose register and
102 also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
103 be in the range [-128, 127].
104 For instance: haddps xmm0, [esi+eax*4+64]
105 becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
// Converts an addressing scale factor (1, 2, 4 or 8) into the two scale bits (bits 7..6)
// that ADDRESS_SCALEADDR / ADDRESS_SCALEADDRC OR into their emitted byte:
//   1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The parameter and the whole expansion are parenthesized so the macro also evaluates
// correctly when given an expression argument or when combined with operators that bind
// tighter than '|' (e.g. '&', '+', '==').
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )
129 #define ADDRESS_ADDC( reg0, constant ) 0x40 | ( reg0 & 7 ) \
// Source operand for an effective address of the form [reg0+reg1].
// Expands to 0x04, which the instruction macro ORs into its final emitted byte ( ... | src ),
// followed by one extra emitted byte holding reg1 in bits 5..3 and reg0 in bits 2..0.
// NOTE(review): this matches the x86 ModRM "r/m = 100b" + SIB (scale 1) encoding - confirm against the Intel SDM.
132 #define ADDRESS_ADDR( reg0, reg1 ) 0x04 \
133 _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
135 #define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \
136 _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
// Source operand for an effective address of the form [reg0+reg1*scale], scale being 1, 2, 4 or 8.
// Expands to 0x04, which the instruction macro ORs into its final emitted byte ( ... | src ),
// followed by one extra emitted byte holding reg1 in bits 5..3, reg0 in bits 2..0 and the
// RSCALE( scale ) bits in bits 7..6.
// NOTE(review): this matches the x86 ModRM "r/m = 100b" + SIB encoding - confirm against the Intel SDM.
139 #define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \
140 _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
142 #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \
143 _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
147 // Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
148 #define addsubps( dst, src ) \
152 _asm _emit ( ( dst & 7 ) << 3 ) | src
154 // Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
155 #define addsubpd( dst, src ) \
159 _asm _emit ( ( dst & 7 ) << 3 ) | src
161 // Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
162 #define haddps( dst, src ) \
166 _asm _emit ( ( dst & 7 ) << 3 ) | src
168 // Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
169 #define haddpd( dst, src ) \
173 _asm _emit ( ( dst & 7 ) << 3 ) | src
175 // Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
176 #define hsubps( dst, src ) \
180 _asm _emit ( ( dst & 7 ) << 3 ) | src
182 // Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
183 #define hsubpd( dst, src ) \
187 _asm _emit ( ( dst & 7 ) << 3 ) | src
189 // Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
190 #define movsldup( dst, src ) \
194 _asm _emit ( ( dst & 7 ) << 3 ) | src
196 // Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
197 #define movdldup( dst, src ) \
201 _asm _emit ( ( dst & 7 ) << 3 ) | src
203 // Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
204 #define movshdup( dst, src ) \
208 _asm _emit ( ( dst & 7 ) << 3 ) | src
210 // Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
211 #define movdhdup( dst, src ) \
215 _asm _emit ( ( dst & 7 ) << 3 ) | src
217 // Load Unaligned Integer 128 bits
218 #define lddqu( dst, src ) \
222 _asm _emit ( ( dst & 7 ) << 3 ) | src
// Hard-coded structure sizes and member offsets, in bytes, for use as constants in the
// inline assembly of this file. idSIMD_SSE3::TransformVerts asserts that DRAWVERT_SIZE,
// DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE and JOINTMAT_SIZE agree with the C++ types;
// update these values whenever the corresponding structures change.

// sizeof( idDrawVert )
225 #define DRAWVERT_SIZE 60
// byte offset of idDrawVert::xyz
226 #define DRAWVERT_XYZ_OFFSET (0*4)
// The remaining DRAWVERT_*_OFFSET values are presumably the byte offsets of the
// correspondingly named idDrawVert members; no assert visible here checks them - TODO confirm.
227 #define DRAWVERT_ST_OFFSET (3*4)
228 #define DRAWVERT_NORMAL_OFFSET (5*4)
229 #define DRAWVERT_TANGENT0_OFFSET (8*4)
230 #define DRAWVERT_TANGENT1_OFFSET (11*4)
231 #define DRAWVERT_COLOR_OFFSET (14*4)
// presumably sizeof( idJointQuat ) - TODO confirm, not checked by a visible assert
233 #define JOINTQUAT_SIZE (7*4)
// sizeof( idJointMat )
234 #define JOINTMAT_SIZE (4*3*4)
// sizeof( idVec4 ), the per-weight element type passed to TransformVerts
235 #define JOINTWEIGHT_SIZE (4*4)
243 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
250 haddps( _xmm0, _xmm0 )
251 haddps( _xmm0, _xmm0 )
262 const char * idSIMD_SSE3::GetName( void ) const {
263 return "MMX & SSE & SSE2 & SSE3";
268 idSIMD_SSE3::TransformVerts
271 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
274 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
275 assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
276 assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
277 assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
284 imul eax, DRAWVERT_SIZE
299 add esi, JOINTWEIGHT_SIZE
302 mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
303 mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
304 mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
306 cmp dword ptr [edx-4], 0
315 add esi, JOINTWEIGHT_SIZE
318 mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
319 mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
320 mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
322 cmp dword ptr [edx-4], 0
331 add eax, DRAWVERT_SIZE
333 haddps( _xmm0, _xmm1 )
334 haddps( _xmm2, _xmm0 )
336 movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
338 haddps( _xmm2, _xmm2 )
340 movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
349 const byte *jointsPtr = (byte *)joints;
351 for( j = i = 0; i < numVerts; i++ ) {
354 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
355 while( index[j*2+1] == 0 ) {
357 v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];