neo/idlib/math/Simd_SSE3.cpp

   1 /*
   2 ===========================================================================
   3
   4 Doom 3 GPL Source Code
   5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
   6
    7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
   8
   9 Doom 3 Source Code is free software: you can redistribute it and/or modify
  10 it under the terms of the GNU General Public License as published by
  11 the Free Software Foundation, either version 3 of the License, or
  12 (at your option) any later version.
  13
  14 Doom 3 Source Code is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
  23
  24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  25
  26 ===========================================================================
  27 */
  28
  29 #include "../precompiled.h"
  30 #pragma hdrstop
  31
  32 #include "Simd_Generic.h"
  33 #include "Simd_MMX.h"
  34 #include "Simd_SSE.h"
  35 #include "Simd_SSE2.h"
  36 #include "Simd_SSE3.h"
  37
  38
  39 //===============================================================
  40 //
  41 //      SSE3 implementation of idSIMDProcessor
  42 //
  43 //===============================================================
  44
  45 #if defined(MACOS_X) && defined(__i386__)
  46
  47 /*
  48 ============
  49 idSIMD_SSE3::GetName
  50 ============
  51 */
  52 const char * idSIMD_SSE3::GetName( void ) const {
  53         return "MMX & SSE & SSE2 & SSE3";
  54 }
  55
  56 #elif defined(_WIN32)
  57
  58 #include <xmmintrin.h>
   59
      // Builders for shuffle selector bytes (presumably the immediate operand of shufps / shufpd).
      // SHUFFLEPS masks each argument to 2 bits and packs x into bits 7-6, y into bits 5-4,
      // z into bits 3-2 and w into bits 1-0. R_SHUFFLEPS takes the same four selectors in the
      // reverse argument order (x lands in bits 1-0, w in bits 7-6).
      // SHUFFLEPD masks each argument to 1 bit and packs x into bit 1 and y into bit 0;
      // R_SHUFFLEPD is the reversed-argument form.
   60 #define SHUFFLEPS( x, y, z, w )         (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
   61 #define R_SHUFFLEPS( x, y, z, w )       (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
   62 #define SHUFFLEPD( x, y )                       (( (x) & 1 ) << 1 | ( (y) & 1 ))
   63 #define R_SHUFFLEPD( x, y )                     (( (y) & 1 ) << 1 | ( (x) & 1 ))
  64
  65 /*
  66
  67         The first argument of an instruction macro is the destination
  68         and the second argument is the source operand. The destination
  69         operand can be _xmm0 to _xmm7 only. The source operand can be
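   70         any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _ebx,
   71         _esi, or _edi that contains the effective address. Do not pass a bare
              _esp or _ebp: without a displacement those ModR/M codes mean "SIB byte
              follows" and "32-bit displacement only", not [esp] and [ebp].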
  72
  73         For instance:  haddps   xmm0, xmm1
  74         becomes:       haddps( _xmm0, _xmm1 )
  75         and:           haddps   xmm0, [esi]
  76         becomes:       haddps( _xmm0, _esi )
  77
  78         The ADDRESS_ADDC macro can be used when the effective source address
   79         is formed by adding a constant to a general purpose register.
              The constant is emitted as a single byte, so it must be in the
              range [-128, 127].
  80         For instance:  haddps   xmm0, [esi+48]
  81         becomes:       haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
  82
  83         The ADDRESS_ADDR macro can be used when the effective source address
  84         is formed by adding two general purpose registers.
  85         For instance:  haddps   xmm0, [esi+eax]
  86         becomes:       haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
  87
  88         The ADDRESS_ADDRC macro can be used when the effective source address
  89         is formed by adding two general purpose registers and a constant.
  90         The constant must be in the range [-128, 127].
  91         For instance:  haddps   xmm0, [esi+eax+48]
  92         becomes:       haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
  93
  94         The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
  95         by adding a scaled general purpose register to another general purpose register.
  96         The scale must be either 1, 2, 4 or 8.
  97         For instance:  haddps   xmm0, [esi+eax*4]
  98         becomes:       haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
  99
 100         The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
 101         by adding a scaled general purpose register to another general purpose register and
 102         also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
 103         be in the range [-128, 127].
 104         For instance:  haddps   xmm0, [esi+eax*4+64]
 105         becomes:       haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
 106
 107 */
  108
      // Operand codes for the instruction macros in this file, which OR the chosen value
      // into the ModR/M byte as the source operand.
      //
      // General purpose registers: the 3-bit register number only, which leaves the mod
      // field (bits 7-6) at 00, i.e. a memory operand addressed through that register.
      // NOTE(review): in the IA-32 ModR/M table, mod=00 with r/m=100 (_esp) means "SIB byte
      // follows" and r/m=101 (_ebp) means "disp32 only", so passing _esp or _ebp bare does
      // not encode [esp] / [ebp] - confirm against the Intel SDM before using them that way.
  109 #define _eax    0x00
  110 #define _ecx    0x01
  111 #define _edx    0x02
  112 #define _ebx    0x03
  113 #define _esp    0x04
  114 #define _ebp    0x05
  115 #define _esi    0x06
  116 #define _edi    0x07
  117
      // XMM registers: 0xC0 sets the mod field to 11 (register-direct operand) and the low
      // 3 bits hold the register number. The instruction macros mask a destination with
      // "& 7", so these values serve as both destination and source arguments.
  118 #define _xmm0   0xC0
  119 #define _xmm1   0xC1
  120 #define _xmm2   0xC2
  121 #define _xmm3   0xC3
  122 #define _xmm4   0xC4
  123 #define _xmm5   0xC5
  124 #define _xmm6   0xC6
  125 #define _xmm7   0xC7
  126
      // RSCALE maps a scale factor to the SIB scale field (bits 7-6):
      // 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0. Any other value gives a meaningless result.
      // The expansion is not wrapped in outer parentheses and 's' is not parenthesized, so it
      // is only safe where it is OR-ed into a larger expression with a simple argument.
  127 #define RSCALE( s )             ( (s&2)<<5 ) | ( (s&4)<<5 ) | ( (s&8)<<3 ) | ( (s&8)<<4 )
  128
      // The ADDRESS_* macros below are only meaningful as the 'src' argument of an instruction
      // macro: their first token completes the ModR/M byte being emitted and the trailing
      // "_asm _emit" statements append the SIB and/or displacement bytes that follow it.
      //
      // ADDRESS_ADDC: [reg0 + constant]. 0x40 selects mod=01 (one displacement byte follows).
      // NOTE(review): r/m=100 (_esp) requires a SIB byte in this form, which is not emitted.
  129 #define ADDRESS_ADDC( reg0, constant )                                          0x40 | ( reg0 & 7 )     \
  130         _asm _emit constant
  131
      // ADDRESS_ADDR: [reg0 + reg1]. 0x04 selects mod=00 with r/m=100 (SIB byte follows);
      // the SIB byte carries reg1 as index (bits 5-3) and reg0 as base (bits 2-0), scale 1.
      // NOTE(review): per the IA-32 SIB table, index 100 (_esp) means "no index" and base 101
      // (_ebp) with mod=00 means "disp32, no base" - avoid those combinations or confirm.
  132 #define ADDRESS_ADDR( reg0, reg1 )                                                      0x04                            \
  133         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
  134
      // ADDRESS_ADDRC: [reg0 + reg1 + constant]. 0x44 selects mod=01 with r/m=100, so a SIB
      // byte and then one displacement byte follow.
  135 #define ADDRESS_ADDRC( reg0, reg1, constant )                           0x44                            \
  136         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )                                                         \
  137         _asm _emit constant
  138
      // ADDRESS_SCALEADDR: [reg0 + reg1*scale]. Same as ADDRESS_ADDR with the scale bits from
      // RSCALE OR-ed into the SIB byte.
  139 #define ADDRESS_SCALEADDR( reg0, reg1, scale )                          0x04                            \
  140         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
  141
      // ADDRESS_SCALEADDRC: [reg0 + reg1*scale + constant]. Same as ADDRESS_ADDRC with the
      // scale bits from RSCALE OR-ed into the SIB byte.
  142 #define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )       0x44                            \
  143         _asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )                       \
  144         _asm _emit constant
 145
 146
  147 // Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
      // Emits ADDSUBPS (F2 0F D0 /r). As in every macro of this group, the last emitted byte is
      // the ModR/M byte: ( dst & 7 ) << 3 fills the reg field with the destination XMM register
      // and 'src' supplies the mod and r/m fields (plus any SIB/displacement bytes when an
      // ADDRESS_* macro is passed). 'dst' and 'src' are not parenthesized in the expansion.
  148 #define addsubps( dst, src )                                            \
  149         _asm _emit 0xF2                                                                 \
  150         _asm _emit 0x0F                                                                 \
  151         _asm _emit 0xD0                                                                 \
  152         _asm _emit ( ( dst & 7 ) << 3 ) | src
  153
  154 // Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
      // Emits ADDSUBPD (66 0F D0 /r).
  155 #define addsubpd( dst, src )                                            \
  156         _asm _emit 0x66                                                                 \
  157         _asm _emit 0x0F                                                                 \
  158         _asm _emit 0xD0                                                                 \
  159         _asm _emit ( ( dst & 7 ) << 3 ) | src
  160
  161 // Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
      // Emits HADDPS (F2 0F 7C /r).
  162 #define haddps( dst, src )                                                      \
  163         _asm _emit 0xF2                                                                 \
  164         _asm _emit 0x0F                                                                 \
  165         _asm _emit 0x7C                                                                 \
  166         _asm _emit ( ( dst & 7 ) << 3 ) | src
  167
  168 // Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
      // Emits HADDPD (66 0F 7C /r).
  169 #define haddpd( dst, src )                                                      \
  170         _asm _emit 0x66                                                                 \
  171         _asm _emit 0x0F                                                                 \
  172         _asm _emit 0x7C                                                                 \
  173         _asm _emit ( ( dst & 7 ) << 3 ) | src
  174
  175 // Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
      // Emits HSUBPS (F2 0F 7D /r).
  176 #define hsubps( dst, src )                                                      \
  177         _asm _emit 0xF2                                                                 \
  178         _asm _emit 0x0F                                                                 \
  179         _asm _emit 0x7D                                                                 \
  180         _asm _emit ( ( dst & 7 ) << 3 ) | src
  181
  182 // Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
      // Emits HSUBPD (66 0F 7D /r).
  183 #define hsubpd( dst, src )                                                      \
  184         _asm _emit 0x66                                                                 \
  185         _asm _emit 0x0F                                                                 \
  186         _asm _emit 0x7D                                                                 \
  187         _asm _emit ( ( dst & 7 ) << 3 ) | src
  188
  189 // Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
      // Emits MOVSLDUP (F3 0F 12 /r).
  190 #define movsldup( dst, src )                                            \
  191         _asm _emit 0xF3                                                                 \
  192         _asm _emit 0x0F                                                                 \
  193         _asm _emit 0x12                                                                 \
  194         _asm _emit ( ( dst & 7 ) << 3 ) | src
  195
  196 // Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
      // The emitted bytes (F2 0F 12 /r) are the encoding the Intel manuals call MOVDDUP.
  197 #define movdldup( dst, src )                                            \
  198         _asm _emit 0xF2                                                                 \
  199         _asm _emit 0x0F                                                                 \
  200         _asm _emit 0x12                                                                 \
  201         _asm _emit ( ( dst & 7 ) << 3 ) | src
  202
  203 // Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
      // Emits MOVSHDUP (F3 0F 16 /r).
  204 #define movshdup( dst, src )                                            \
  205         _asm _emit 0xF3                                                                 \
  206         _asm _emit 0x0F                                                                 \
  207         _asm _emit 0x16                                                                 \
  208         _asm _emit ( ( dst & 7 ) << 3 ) | src
  209
  210 // Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
      // NOTE(review): the SSE3 instruction list has no double-precision "high duplicate";
      // F2 0F 16 does not look like a defined opcode, so this macro may raise an invalid-opcode
      // fault. Confirm against the Intel SDM opcode map before using it.
  211 #define movdhdup( dst, src )                                            \
  212         _asm _emit 0xF2                                                                 \
  213         _asm _emit 0x0F                                                                 \
  214         _asm _emit 0x16                                                                 \
  215         _asm _emit ( ( dst & 7 ) << 3 ) | src
  216
  217 // Load Unaligned Integer 128 bits
      // Emits LDDQU (F2 0F F0 /r). NOTE(review): LDDQU is defined for a memory source only, so
      // 'src' should be a general purpose register code or an ADDRESS_* macro, not _xmmN - confirm.
  218 #define lddqu( dst, src )                                                       \
  219         _asm _emit 0xF2                                                                 \
  220         _asm _emit 0x0F                                                                 \
  221         _asm _emit 0xF0                                                                 \
  222         _asm _emit ( ( dst & 7 ) << 3 ) | src
 223
  224
      // Byte sizes and member offsets written as literals so the inline assembly can use them
      // as immediates and displacements. TransformVerts asserts DRAWVERT_SIZE,
      // DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE and JOINTMAT_SIZE against the real types; the
      // remaining values are not checked anywhere in the code visible here.
      // NOTE(review): the offsets presumably mirror the idDrawVert member layout
      // (xyz, st, normal, tangents[0], tangents[1], color) - verify against the class.
  225 #define DRAWVERT_SIZE                           60
  226 #define DRAWVERT_XYZ_OFFSET                     (0*4)
  227 #define DRAWVERT_ST_OFFSET                      (3*4)
  228 #define DRAWVERT_NORMAL_OFFSET          (5*4)
  229 #define DRAWVERT_TANGENT0_OFFSET        (8*4)
  230 #define DRAWVERT_TANGENT1_OFFSET        (11*4)
  231 #define DRAWVERT_COLOR_OFFSET           (14*4)
  232
  233 #define JOINTQUAT_SIZE                          (7*4)
  234 #define JOINTMAT_SIZE                           (4*3*4)
  235 #define JOINTWEIGHT_SIZE                        (4*4)
 236
 237
  238 /*
  239 ============
  240 SSE3_Dot
        Returns the sum of the four component-wise products of v1 and v2.
        Both vectors are read with 16-byte aligned memory operands (movaps /
        mulps), so both must be 16-byte aligned.
  241 ============
  242 */
  243 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
  244         float d;
  245         __asm {
                      // the reference parameters are loaded as addresses and dereferenced below
  246                 mov             esi, v1
  247                 mov             edi, v2
                      // xmm0 = four component-wise products
  248                 movaps  xmm0, [esi]
  249                 mulps   xmm0, [edi]
                      // first haddps leaves the two pair sums, the second adds those together,
                      // so every lane of xmm0 ends up holding the full sum
  250                 haddps( _xmm0, _xmm0 )
  251                 haddps( _xmm0, _xmm0 )
                      // store the low lane as the result
  252                 movss   d, xmm0
  253         }
  254         return d;
  255 }
 256
 257 /*
 258 ============
 259 idSIMD_SSE3::GetName
 260 ============
 261 */
 262 const char * idSIMD_SSE3::GetName( void ) const {
 263         return "MMX & SSE & SSE2 & SSE3";
 264 }
 265
  266 /*
  267 ============
  268 idSIMD_SSE3::TransformVerts
        Writes verts[i].xyz for each of the numVerts vertices as the sum of
        ( joint matrix * weight ) over that vertex's consecutive run of weights.
        index holds two ints per weight: the first is the byte offset of the
        joint matrix from the start of joints, the second is non-zero on the
        last weight of a vertex. weights and index are consumed sequentially
        across all vertices. numWeights is not read by either code path.
        The assembly path reads weights and joint rows with aligned memory
        operands (movaps / mulps), so both arrays must be 16-byte aligned.
  269 ============
  270 */
  271 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
  272 #if 1
  273
              // the assembly below hard-codes these sizes and offsets
  274         assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
  275         assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
  276         assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
  277         assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
  278
  279         __asm
  280         {
                      // nothing to do for zero vertices; otherwise eax = total size of verts in bytes
  281                 mov                     eax, numVerts
  282                 test            eax, eax
  283                 jz                      done
  284                 imul            eax, DRAWVERT_SIZE
  285
  286                 mov                     ecx, verts
  287                 mov                     edx, index
  288                 mov                     esi, weights
  289                 mov                     edi, joints
  290
                      // ecx = one past the last vertex, eax = negative byte offset that counts up to zero
  291                 add                     ecx, eax
  292                 neg                     eax
  293
              // first weight of a vertex: ebx = joint byte offset, xmm0/xmm1/xmm2 = weight * one matrix row each
  294         loopVert:
  295                 mov                     ebx, [edx]
  296                 movaps          xmm2, [esi]
  297                 add                     edx, 8
  298                 movaps          xmm0, xmm2
  299                 add                     esi, JOINTWEIGHT_SIZE
  300                 movaps          xmm1, xmm2
  301
  302                 mulps           xmm0, [edi+ebx+ 0]                                              // xmm0 = m0, m1, m2, t0
  303                 mulps           xmm1, [edi+ebx+16]                                              // xmm1 = m3, m4, m5, t1
  304                 mulps           xmm2, [edi+ebx+32]                                              // xmm2 = m6, m7, m8, t2
  305
                      // edx was already advanced, so [edx-4] is the "last weight" flag of the weight just used
  306                 cmp                     dword ptr [edx-4], 0
  307
  308                 jne                     doneWeight
  309
              // remaining weights of the vertex: same products, accumulated into xmm0/xmm1/xmm2
  310         loopWeight:
  311                 mov                     ebx, [edx]
  312                 movaps          xmm5, [esi]
  313                 add                     edx, 8
  314                 movaps          xmm3, xmm5
  315                 add                     esi, JOINTWEIGHT_SIZE
  316                 movaps          xmm4, xmm5
  317
  318                 mulps           xmm3, [edi+ebx+ 0]                                              // xmm3 = m0, m1, m2, t0
  319                 mulps           xmm4, [edi+ebx+16]                                              // xmm4 = m3, m4, m5, t1
  320                 mulps           xmm5, [edi+ebx+32]                                              // xmm5 = m6, m7, m8, t2
  321
                      // the flags set here are consumed by the je below; the addps in between do not modify EFLAGS
  322                 cmp                     dword ptr [edx-4], 0
  323
  324                 addps           xmm0, xmm3
  325                 addps           xmm1, xmm4
  326                 addps           xmm2, xmm5
  327
  328                 je                      loopWeight
  329
  330         doneWeight:
                      // advance to the next vertex now; the flags from this add drive the jl at the bottom
                      // and the stores compensate with -DRAWVERT_SIZE
  331                 add                     eax, DRAWVERT_SIZE
  332
                      // xmm0 = pair sums of the x row and the y row,
                      // then xmm2 = ( z pair sum, z pair sum, x, y )
  333                 haddps(         _xmm0, _xmm1 )
  334                 haddps(         _xmm2, _xmm0 )
  335
                      // store x and y from the high half of xmm2
  336                 movhps          [ecx+eax-DRAWVERT_SIZE+0], xmm2
  337
                      // low lane of xmm2 = z
  338                 haddps(         _xmm2, _xmm2 )
  339
  340                 movss           [ecx+eax-DRAWVERT_SIZE+8], xmm2
  341
                      // loop while the offset is still negative
  342                 jl                      loopVert
  343         done:
  344         }
  345
  346 #else
  347
              // reference C++ implementation of the same computation; j walks the weights, i the vertices
  348         int i, j;
  349         const byte *jointsPtr = (byte *)joints;
  350
  351         for( j = i = 0; i < numVerts; i++ ) {
  352                 idVec3 v;
  353
                      // index[j*2+0] is a byte offset into joints; index[j*2+1] == 0 means more weights follow
  354                 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  355                 while( index[j*2+1] == 0 ) {
  356                         j++;
  357                         v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
  358                 }
  359                 j++;
  360
  361                 verts[i].xyz = v;
  362         }
  363
  364 #endif
  365 }
 366
 367 #endif /* _WIN32 */