/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
#include "../precompiled.h"

#include "Simd_Generic.h"
#include "Simd_AltiVec.h"

#include <ppc_intrinsics.h>

// Doom3 SIMD Library version 0.5
// Patrick Flanagan (pflanagan@apple.com)
// Sanjay Patel (spatel@apple.com)
// Architecture & Performance Group, Apple Computer

//===============================================================
//
// AltiVec implementation of idSIMDProcessor
//
//===============================================================

#if defined(MACOS_X) && defined(__ppc__)
// Element strides, in floats, used when walking arrays of these engine types
// with vector loads.
#ifndef DRAWVERT_PADDED
// 60 bytes, 15 floats at 4 bytes each
#define DRAWVERT_OFFSET 15
// NOTE(review): the '#else' (and matching '#endif') of the DRAWVERT_PADDED
// conditional is not visible in this chunk; the definition below is the
// padded (64-byte) case. Verify against the upstream source.
// 64 bytes, 16 floats
#define DRAWVERT_OFFSET 16
// 16 bytes each, 4 floats
#define PLANE_OFFSET 4
// 16 bytes each, 4 floats
#define IDVEC4_OFFSET 4

// Alignment tests: an address is 16-byte aligned iff its low four bits are
// zero. Note these macros take an lvalue, not a pointer — they apply '&'
// themselves.
#define IS_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) == 0 )
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F) != 0 )
// Aligned storing floats
// Each ALIGNED_STOREn macro stores n consecutive 16-byte vectors starting at
// ADDR. ADDR must be 16-byte aligned: vec_st ignores the low four bits of
// the effective address, so an unaligned ADDR would silently store to the
// rounded-down address.

// Store 2 vectors (8 floats) to an aligned address.
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
vec_st( V0, 0, ADDR ); \
vec_st( V1, 16, ADDR )

// Store 3 vectors (12 floats) to an aligned address.
#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
vec_st( V0, 0, ADDR ); \
vec_st( V1, 16, ADDR ); \
vec_st( V2, 32, ADDR )

// Store 4 vectors (16 floats) to an aligned address.
#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
vec_st( V0, 0, ADDR ); \
vec_st( V1, 16, ADDR ); \
vec_st( V2, 32, ADDR ); \
vec_st( V3, 48, ADDR )

// Store 6 vectors (24 floats) to an aligned address.
#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
vec_st( V0, 0, ADDR ); \
vec_st( V1, 16, ADDR ); \
vec_st( V2, 32, ADDR ); \
vec_st( V3, 48, ADDR ); \
vec_st( V4, 64, ADDR ); \
vec_st( V5, 80, ADDR )

// Store 8 vectors (32 floats) to an aligned address.
#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
vec_st( V0, 0, ADDR ); \
vec_st( V1, 16, ADDR ); \
vec_st( V2, 32, ADDR ); \
vec_st( V3, 48, ADDR ); \
vec_st( V4, 64, ADDR ); \
vec_st( V5, 80, ADDR ); \
vec_st( V6, 96, ADDR ); \
vec_st( V7, 112, ADDR )
// Unaligned storing floats. These assume that we can trash the input
//
// How these work: vec_ld/vec_st ignore the low four bits of the effective
// address, so offsets 15, 31, 47 ... address the 2nd, 3rd, 4th ... 16-byte
// blocks that an unaligned n-vector store touches. Each macro loads the
// first and last blocks already in memory, right-rotates the input vectors
// with the vec_lvsr-derived permute, then uses vec_sel with a boundary mask
// so only the bytes inside the destination range are overwritten.
//
// NOTE(review): the closing '}' of UNALIGNED_STORE1 is not visible in this
// chunk — verify against the upstream source before use.
#define UNALIGNED_STORE1( ADDR, V0 ) { \
/* use store element */ \
vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
vec_ste( V0, 0, ADDR ); \
vec_ste( V0, 4, ADDR ); \
vec_ste( V0, 8, ADDR ); \
vec_ste( V0, 12, ADDR ); \
#define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
/* load up the values that are there now */ \
vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
/* generate permute vector and mask */ \
vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
/* right rotate input data */ \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
/* setup the output vectors */ \
vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
/* store results */ \
vec_st( ULStoreVal1, 0, ADDR ); \
vec_st( ULStoreVal2, 15, ADDR ); \
vec_st( ULStoreVal3, 31, ADDR ); }

// Store 3 vectors (12 floats) to an unaligned address, preserving the bytes
// outside the destination range.
#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
/* load up the values that are there now */ \
vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
/* generate permute vector and mask */ \
vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
/* right rotate input data */ \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
/* setup the output vectors */ \
vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
/* store results */ \
vec_st( ULStoreVal1, 0, ADDR ); \
vec_st( ULStoreVal2, 15, ADDR ); \
vec_st( ULStoreVal3, 31, ADDR ); \
vec_st( ULStoreVal4, 47, ADDR ); }

// Store 4 vectors (16 floats) to an unaligned address.
#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
/* load up the values that are there now */ \
vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
/* generate permute vector and mask */ \
vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
/* right rotate input data */ \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
/* setup the output vectors */ \
vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
/* store results */ \
vec_st( ULStoreVal1, 0, ADDR ); \
vec_st( ULStoreVal2, 15, ADDR ); \
vec_st( ULStoreVal3, 31, ADDR ); \
vec_st( ULStoreVal4, 47, ADDR ); \
vec_st( ULStoreVal5, 63, ADDR ); }

// Store 6 vectors (24 floats) to an unaligned address.
#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
/* load up the values that are there now */ \
vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
/* generate permute vector and mask */ \
vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
/* right rotate input data */ \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
/* setup the output vectors */ \
vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
/* store results */ \
vec_st( ULStoreVal1, 0, ADDR ); \
vec_st( ULStoreVal2, 15, ADDR ); \
vec_st( ULStoreVal3, 31, ADDR ); \
vec_st( ULStoreVal4, 47, ADDR ); \
vec_st( ULStoreVal5, 63, ADDR ); \
vec_st( ULStoreVal6, 79, ADDR ); \
vec_st( ULStoreVal7, 95, ADDR ); }

// Store 9 vectors (36 floats) to an unaligned address.
#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
/* load up the values that are there now */ \
vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
/* generate permute vector and mask */ \
vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
/* right rotate input data */ \
V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
/* setup the output vectors */ \
vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
/* store results */ \
vec_st( ULStoreVal1, 0, ADDR ); \
vec_st( ULStoreVal2, 15, ADDR ); \
vec_st( ULStoreVal3, 31, ADDR ); \
vec_st( ULStoreVal4, 47, ADDR ); \
vec_st( ULStoreVal5, 63, ADDR ); \
vec_st( ULStoreVal6, 79, ADDR ); \
vec_st( ULStoreVal7, 95, ADDR ); \
vec_st( ULStoreVal8, 111, ADDR ); \
vec_st( ULStoreVal9, 127, ADDR ); \
vec_st( ULStoreVal10, 143, ADDR ); }
264 idSIMD_AltiVec::GetName
267 const char *idSIMD_AltiVec::GetName( void ) const {
275 // Prints the values of a vector, useful for debugging but
276 // should never be called in real code
277 inline void debugPrintVector( vector float v, char *msg ) {
278 printf("%s -- %vf\n", msg, v );
281 inline void debugPrintVector( vector unsigned int v, char *msg ) {
282 printf("%s -- %vd\n", msg, v );
285 inline void debugPrintVector( vector bool int v, char *msg ) {
286 printf("%s -- %vi\n", msg, v );
289 inline void debugPrintVector( vector unsigned char v, char *msg ) {
290 printf("%s -- %vuc\n", msg, v );
293 inline void debugPrintVector( vector unsigned short v, char *msg ) {
294 printf("%s -- %vs\n", msg, v );
301 For each element in vector:
306 // Use Newton-Raphson to calculate reciprocal of a vector
307 inline vector float Reciprocal( vector float v ) {
308 //Get the reciprocal estimate
309 vector float estimate = vec_re( v );
310 //One round of Newton-Raphson refinement
311 return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
318 For each element in vector:
322 // Reciprocal square root estimate of a vector
323 inline vector float ReciprocalSquareRoot( vector float v ) {
324 //Get the square root reciprocal estimate
325 vector float zero = (vector float)(0);
326 vector float oneHalf = (vector float)(0.5);
327 vector float one = (vector float)(1.0);
328 vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
330 //One round of Newton-Raphson refinement
331 vector float estimateSquared = vec_madd( estimate, estimate, zero );
332 vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
333 return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
341 For each element in vectors:
345 // Use reciprocal estimate and multiply to divide a vector
346 inline vector float Divide( vector float a, vector float b ) {
347 return vec_madd( a, Reciprocal( b ), (vector float)(0) );
352 loadSplatUnalignedScalar
354 For each element in vector:
358 inline vector float loadSplatUnalignedScalar( const float *s ) {
359 vector unsigned char splatMap = vec_lvsl( 0, s );
360 vector float v = vec_ld( 0, s );
361 splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
362 return vec_perm( v, v, splatMap );
369 For each element in vector:
370 n = idMath::ATan16( x, y )
373 // calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
374 inline vector float VectorATan16( vector float x, vector float y ) {
376 vector float xDivY = Divide( x, y );
377 vector float yDivX = Divide( y, x );
378 vector float zeroVector = (vector float)(0);
380 vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
381 vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
382 vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
383 vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
385 // do calculation for S
386 vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
387 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
388 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
389 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
390 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
391 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
392 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
393 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
395 // get the regular S value
396 vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
398 // calculate what to return if y > x
399 vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
400 vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
401 vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
403 return vec_sel( modRet, vecS, vecCmp );
/*
============
VectorSin16

	For each element in vector:
		n = idMath::Sin16( v )
============
*/
// Sine of each vector element with ~16 bits of precision, based on Sin16 in
// idMath: range-reduce into [-PI/2, PI/2], then evaluate a polynomial.
inline vector float VectorSin16( vector float v ) {
vector float zero = (vector float)(0);

// NOTE(review): both groups below declare halfPI/PI/oneandhalfPI/twoPI.
// An '#if ... #else ... #endif' pair appears to have been lost from this
// chunk; exactly one group should be active. Verify against upstream.

// load up half PI and use it to calculate the rest of the values. This is
// sometimes cheaper than loading them from memory
vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
vector float PI = vec_add( halfPI, halfPI );
vector float oneandhalfPI = vec_add( PI, halfPI );
vector float twoPI = vec_add( oneandhalfPI, halfPI );

vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
vector float PI = (vector float)(3.14159265358979323846f);
vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);

vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
// NOTE(review): 'vecMod' is used below but its declaration
// ('vector float vecMod;') is not visible in this chunk.
vector float vecResult;

// fix the range if needbe
// range reduction: a = v - floor( v / 2PI ) * 2PI
vecMod = vec_floor( Divide( v, twoPI ) );
vecResult = vec_nmsub( vecMod, twoPI, v );

vector float vecPIminusA = vec_sub( PI, vecResult );
vector float vecAminus2PI = vec_sub( vecResult, twoPI );

vecCmp1 = vec_cmplt( vecResult, PI );
vecCmp2 = vec_cmpgt( vecResult, halfPI );

// these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );

// we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false

// these are ones where a < PI and a > HALF_PI so we set a = PI - a
vecCmp1 = vec_and( vecCmp1, vecCmp2 );
vecCmp1 = vec_or( vecCmp1, vecCmp4 );

// put the correct values into place
vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );

// polynomial approximation of sin on the reduced argument
vector float vecASquared = vec_madd( vecResult, vecResult, zero );
vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
return vec_madd( vecResult, vecEst, zero );
// NOTE(review): the closing '}' of VectorSin16 is not visible in this chunk.
475 For each element in vector:
479 // splats an element across a vector using a runtime variable
480 inline vector float vecSplatWithRunTime( vector float v, int i ) {
481 vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
482 v = vec_perm( v, v, rotate );
483 return vec_splat( v, 0 );
494 inline float FastScalarInvSqrt( float f ) {
495 #ifdef PPC_INTRINSICS
497 const float kSmallestFloat = FLT_MIN;
499 //Calculate a 5 bit starting estimate for the reciprocal sqrt
500 estimate = __frsqrte ( f + kSmallestFloat );
502 //if you require less precision, you may reduce the number of loop iterations.
503 // This will do 2 rounds of NR
504 estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
505 estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
508 return idMath::InvSqrt( f );
516 arg1 = 1 / sqrt( arg1 )
517 arg2 = 1 / sqrt( arg2 )
518 arg3 = 1 / sqrt( arg3 )
521 inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
522 #ifdef PPC_INTRINSICS
523 register float estimate1, estimate2, estimate3;
524 const float kSmallestFloat = FLT_MIN;
526 //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
527 estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
528 estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
529 estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
531 // two rounds newton-raphson
532 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
533 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
534 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
535 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
536 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
537 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
543 *arg1 = idMath::InvSqrt( *arg1 );
544 *arg2 = idMath::InvSqrt( *arg2 );
545 *arg3 = idMath::InvSqrt( *arg3 );
553 arg1 = 1 / sqrt( arg1 )
554 arg2 = 1 / sqrt( arg2 )
555 arg3 = 1 / sqrt( arg3 )
556 arg4 = 1 / sqrt( arg4 )
557 arg5 = 1 / sqrt( arg5 )
558 arg6 = 1 / sqrt( arg6 )
560 On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
563 inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
564 #ifdef PPC_INTRINSICS
565 register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
566 const float kSmallestFloat = FLT_MIN;
568 //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
569 estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
570 estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
571 estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
572 estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
573 estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
574 estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
576 // two rounds newton-raphson
577 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
578 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
579 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
580 estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
581 estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
582 estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
584 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
585 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
586 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
587 estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
588 estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
589 estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
598 *arg1 = idMath::InvSqrt( *arg1 );
599 *arg2 = idMath::InvSqrt( *arg2 );
600 *arg3 = idMath::InvSqrt( *arg3 );
601 *arg4 = idMath::InvSqrt( *arg4 );
602 *arg5 = idMath::InvSqrt( *arg5 );
603 *arg6 = idMath::InvSqrt( *arg6 );
// End Helper Functions

#ifdef ENABLE_SIMPLE_MATH
616 dst[i] = constant + src[i];
619 void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
620 vector float v0, v1, v2, v3;
621 vector float v0_low, v0_hi, v1_hi;
622 vector unsigned char permVec;
623 vector float constVec;
626 // handle unaligned cases at beginning
627 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
628 dst[i] = constant + src[i];
631 //splat constant into a vector
632 constVec = loadSplatUnalignedScalar( &constant );
634 //calculate permute and do first load
635 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
636 v1_hi = vec_ld( 0, &src[i] );
639 for ( ; i+7 < count; i += 8 ) {
642 v0_hi = vec_ld( 15, &src[i] );
643 v1_hi = vec_ld( 31, &src[i] );
645 v0 = vec_perm( v0_low, v0_hi, permVec );
646 v1 = vec_perm( v0_hi, v1_hi, permVec );
648 v2 = vec_add( v0, constVec );
649 v3 = vec_add( v1, constVec );
652 ALIGNED_STORE2( &dst[i], v2, v3 );
656 for ( ; i < count ; i++ ) {
657 dst[i] = constant + src[i];
665 dst[i] = src0[i] + src1[i];
668 void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
670 register vector float v0, v1, v2, v3, v4, v5;
672 register vector float v0_low, v0_hi, v2_low, v2_hi;
674 register vector float v1_low, v1_hi, v3_low, v3_hi;
676 register vector unsigned char permVec1, permVec2;
677 vector unsigned char oneCharVector = (vector unsigned char)(1);
682 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
683 dst[i] = src0[i] + src1[i];
686 //calculate permute and do loads
687 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
688 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
689 v2_hi = vec_ld( 0, &src0[i] );
690 v3_hi = vec_ld( 0, &src1[i] );
693 for ( ; i+7 < count; i += 8 ) {
696 v0_hi = vec_ld( 15, &src0[i] );
698 v2_hi = vec_ld( 31, &src0[i] );
701 v1_hi = vec_ld( 15, &src1[i] );
703 v3_hi = vec_ld( 31, &src1[i] );
705 v0 = vec_perm( v0_low, v0_hi, permVec1 );
706 v1 = vec_perm( v1_low, v1_hi, permVec2 );
707 v2 = vec_perm( v2_low, v2_hi, permVec1 );
708 v3 = vec_perm( v3_low, v3_hi, permVec2 );
710 v4 = vec_add( v0, v1 );
711 v5 = vec_add( v2, v3 );
713 ALIGNED_STORE2( &dst[i], v4, v5 );
718 for ( ; i < count ; i++ ) {
719 dst[i] = src0[i] + src1[i];
727 dst[i] = constant - src[i];
730 void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
732 register vector float v0, v1, v2, v3;
733 register vector float v0_low, v0_hi, v1_low, v1_hi;
734 register vector unsigned char permVec;
735 register vector float constVec;
736 vector unsigned char oneCharVector = (vector unsigned char)(1);
739 //handle unaligned at start
740 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
741 dst[i] = constant - src[i];
744 //splat constant into a vector
745 constVec = loadSplatUnalignedScalar( &constant );
747 //calculate permute vector and do first load
748 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
749 v1_hi = vec_ld( 0, &src[i] );
752 for ( ; i+7 < count; i += 8 ) {
755 v0_hi = vec_ld( 15, &src[i] );
757 v1_hi = vec_ld( 31, &src[i] );
759 v0 = vec_perm( v0_low, v0_hi, permVec );
760 v1 = vec_perm( v1_low, v1_hi, permVec );
762 v2 = vec_sub( constVec, v0 );
763 v3 = vec_sub( constVec, v1 );
765 ALIGNED_STORE2( &dst[i], v2, v3 );
769 for ( ; i < count ; i++ ) {
770 dst[i] = constant - src[i];
778 dst[i] = src0[i] - src1[i];
781 void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
782 register vector float v0, v1, v2, v3, v4, v5;
784 register vector float v0_low, v0_hi, v2_low, v2_hi;
786 register vector float v1_low, v1_hi, v3_low, v3_hi;
787 register vector unsigned char permVec1, permVec2;
788 vector unsigned char oneCharVector = (vector unsigned char)(1);
791 //handle unaligned at start
792 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
793 dst[i] = src0[i] - src1[i];
796 //calculate permute and do first loads
797 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
798 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
799 v2_hi = vec_ld( 0, &src0[i] );
800 v3_hi = vec_ld( 0, &src1[i] );
803 for ( ; i+7 < count; i += 8 ) {
806 v0_hi = vec_ld( 15, &src0[i] );
808 v2_hi = vec_ld( 31, &src0[i] );
811 v1_hi = vec_ld( 15, &src1[i] );
813 v3_hi = vec_ld( 31, &src1[i] );
815 v0 = vec_perm( v0_low, v0_hi, permVec1 );
816 v1 = vec_perm( v1_low, v1_hi, permVec2 );
817 v2 = vec_perm( v2_low, v2_hi, permVec1 );
818 v3 = vec_perm( v3_low, v3_hi, permVec2 );
820 v4 = vec_sub( v0, v1 );
821 v5 = vec_sub( v2, v3 );
823 ALIGNED_STORE2( &dst[i], v4, v5 );
827 for ( ; i < count ; i++ ) {
828 dst[i] = src0[i] - src1[i];
836 dst[i] = constant * src[i];
839 void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
840 register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
841 register vector float constVec;
842 register vector unsigned char permVec;
843 vector unsigned char oneCharVector = (vector unsigned char)(1);
844 register vector float zeroVector = (vector float)(0.0);
847 // handle unaligned data at start
848 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
849 dst[i] = constant * src[i];
852 //splat constant into a vector
853 constVec = loadSplatUnalignedScalar( &constant );
855 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
856 v1_hi = vec_ld( 0, &src[i] );
859 for ( ; i+7 < count; i += 8 ) {
862 v0_hi = vec_ld( 15, &src[i] );
864 v1_hi = vec_ld( 31, &src[i] );
866 v0 = vec_perm( v0_low, v0_hi, permVec );
867 v1 = vec_perm( v1_low, v1_hi, permVec );
869 v2 = vec_madd( constVec, v0, zeroVector );
870 v3 = vec_madd( constVec, v1, zeroVector );
872 ALIGNED_STORE2( &dst[i], v2, v3 );
876 for ( ; i < count ; i++ ) {
877 dst[i] = constant * src[i];
885 dst[i] = src0[i] * src1[i];
888 void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
889 register vector float v0, v1, v2, v3, v4, v5;
891 register vector float v0_low, v0_hi, v2_low, v2_hi;
893 register vector float v1_low, v1_hi, v3_low, v3_hi;
895 register vector unsigned char permVec1, permVec2;
896 register vector float constVec = (vector float)(0.0);
897 vector unsigned char oneCharVector = (vector unsigned char)(1);
900 //handle unaligned at start
901 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
902 dst[i] = src0[i] * src1[i];
905 //calculate permute and do loads
906 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
907 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
908 v2_hi = vec_ld( 0, &src0[i] );
909 v3_hi = vec_ld( 0, &src1[i] );
912 for ( ; i+7 < count; i += 8 ) {
915 v0_hi = vec_ld( 15, &src0[i] );
917 v2_hi = vec_ld( 31, &src0[i] );
920 v1_hi = vec_ld( 15, &src1[i] );
922 v3_hi = vec_ld( 31, &src1[i] );
924 v0 = vec_perm( v0_low, v0_hi, permVec1 );
925 v1 = vec_perm( v1_low, v1_hi, permVec2 );
926 v2 = vec_perm( v2_low, v2_hi, permVec1 );
927 v3 = vec_perm( v3_low, v3_hi, permVec2 );
929 //no such thing as regular multiply so we do
930 //multiply then add zero
931 v4 = vec_madd( v0, v1, constVec );
932 v5 = vec_madd( v2, v3, constVec );
934 ALIGNED_STORE2( &dst[i], v4, v5 );
938 for ( ; i < count ; i++ ) {
939 dst[i] = src0[i] * src1[i];
947 dst[i] = constant / divisor[i];
950 void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
951 register vector float v0, v1, v2, v3;
952 register vector float v0_low, v0_hi, v1_low, v1_hi;
953 register vector unsigned char permVec;
954 register vector float constVec;
955 vector unsigned char oneCharVector = (vector unsigned char)(1);
958 //handle unaligned at start
959 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
960 dst[i] = constant / divisor[i];
963 //splat constant into a vector
964 constVec = loadSplatUnalignedScalar( &constant );
966 //calculate permute and do first loads
967 permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
968 v1_hi = vec_ld( 0, &divisor[i] );
971 for ( ; i+7 < count; i += 8 ) {
974 v0_hi = vec_ld( 15, &divisor[i] );
976 v1_hi = vec_ld( 31, &divisor[i] );
978 v0 = vec_perm( v0_low, v0_hi, permVec );
979 v1 = vec_perm( v1_low, v1_hi, permVec );
981 v2 = Divide( constVec, v0 );
982 v3 = Divide( constVec, v1 );
984 ALIGNED_STORE2( &dst[i], v2, v3 );
988 for ( ; i < count ; i++ ) {
989 dst[i] = constant / divisor[i];
997 dst[i] = src0[i] / src1[i];
// idSIMD_AltiVec::Div — dst[i] = src0[i] / src1[i] for count floats.
// Strategy used throughout this file: a scalar head loop runs until dst
// reaches a 16-byte boundary, the vector loop then processes 8 floats per
// iteration (sources realigned with the lvsl(-1)+1 / vec_perm idiom so they
// may be unaligned), and a scalar tail finishes the remainder.
1000 void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
1001 register vector float v0, v1, v2, v3, v4, v5;
1003 register vector float v0_low, v0_hi, v2_low, v2_hi;
1005 register vector float v1_low, v1_hi, v3_low, v3_hi;
1007 register vector unsigned char permVec1, permVec2;
1008 vector unsigned char oneCharVector = (vector unsigned char)(1);
1011 //handle unaligned at start
1012 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1013 dst[i] = src0[i] / src1[i];
// Permute masks are computed once per source; valid for the whole loop
// because i only advances in multiples of 8 floats (32 bytes) from here.
1016 //calculate permute and do loads
1017 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1018 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1019 v2_hi = vec_ld( 0, &src0[i] );
1020 v3_hi = vec_ld( 0, &src1[i] );
1023 for ( ; i+7 < count; i += 8 ) {
// NOTE(review): the v0_low/v1_low/v2_low/v3_low carry-over assignments
// (low halves reuse the previous iteration's high loads) do not appear in
// this view — confirm against the full file.
1026 v0_hi = vec_ld( 15, &src0[i] );
1028 v2_hi = vec_ld( 31, &src0[i] );
1031 v1_hi = vec_ld( 15, &src1[i] );
1033 v3_hi = vec_ld( 31, &src1[i] );
1035 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1036 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1037 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1038 v3 = vec_perm( v3_low, v3_hi, permVec2 );
// Divide() — presumably a Newton-Raphson refinement of vec_re; verify in
// this file's helpers.
1040 v4 = Divide( v0, v1 );
1041 v5 = Divide( v2, v3 );
1043 ALIGNED_STORE2( &dst[i], v4, v5 );
// scalar tail for the last count % 8 elements
1047 for ( ; i < count ; i++ ) {
1048 dst[i] = src0[i] / src1[i];
1054 idSIMD_AltiVec::MulAdd
1056 dst[i] += constant * src[i];
// idSIMD_AltiVec::MulAdd — dst[i] += constant * src[i] for count floats.
// Scalar head loop until dst is 16-byte aligned, then 8 floats per vector
// iteration (src realigned via vec_perm; dst loaded/stored aligned), then a
// scalar tail.
1059 void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
1061 register vector float v0, v1, v2, v3, v4, v5;
1062 register vector float constVec;
1064 register vector float v0_low, v0_hi, v2_low, v2_hi;
1066 register vector unsigned char permVec1;
1067 vector unsigned char oneCharVector = (vector unsigned char)(1);
1070 //handle unaligned at start
1071 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1072 dst[i] += constant * src[i];
1075 //splat constant into a vector
1076 constVec = loadSplatUnalignedScalar( &constant );
1078 //calculate permute and do loads
1079 permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1080 v2_hi = vec_ld( 0, &src[i] );
1083 for ( ; i+7 < count; i += 8 ) {
// NOTE(review): the v0_low/v2_low carry-over assignments (reusing the
// previous high loads) do not appear in this view — confirm against the
// full file.
1085 v0_hi = vec_ld( 15, &src[i] );
1087 v2_hi = vec_ld( 31, &src[i] );
1089 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1090 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1092 // at this point, dst is known to be aligned
1093 v1 = vec_ld( 0, &dst[i] );
1094 v3 = vec_ld( 16, &dst[i] );
// fused multiply-add: constant * src + dst
1096 v4 = vec_madd( constVec, v0, v1 );
1097 v5 = vec_madd( constVec, v2, v3 );
1099 ALIGNED_STORE2( &dst[i], v4, v5 );
// scalar tail
1103 for ( ; i < count ; i++ ) {
1104 dst[i] += constant * src[i];
1110 idSIMD_AltiVec::MulAdd
1112 dst[i] += src0[i] * src1[i];
// idSIMD_AltiVec::MulAdd — dst[i] += src0[i] * src1[i] for count floats.
// Same shape as the constant variant: scalar head loop for dst alignment,
// 8 floats per vector iteration with both sources realigned via vec_perm,
// scalar tail.
1115 void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
1116 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1118 register vector float v0_low, v0_hi, v2_low, v2_hi;
1120 register vector float v1_low, v1_hi, v3_low, v3_hi;
1122 register vector unsigned char permVec1, permVec2;
1123 vector unsigned char oneCharVector = (vector unsigned char)(1);
1127 //unaligned at start
1128 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1129 dst[i] += src0[i] * src1[i];
1132 //calculate permute and do loads
1133 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1134 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1135 v2_hi = vec_ld( 0, &src0[i] );
1136 v3_hi = vec_ld( 0, &src1[i] );
1139 for ( ; i+7 < count; i += 8 ) {
// NOTE(review): the *_low carry-over assignments (reusing the previous
// iteration's high loads) do not appear in this view — confirm against
// the full file.
1142 v0_hi = vec_ld( 15, &src0[i] );
1144 v2_hi = vec_ld( 31, &src0[i] );
1147 v1_hi = vec_ld( 15, &src1[i] );
1149 v3_hi = vec_ld( 31, &src1[i] );
1151 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1152 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1153 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1154 v3 = vec_perm( v3_low, v3_hi, permVec2 );
1156 //we know dst is aligned because we handled unaligned cases
1158 v4 = vec_ld( 0, &dst[i] );
1159 v5 = vec_ld( 16, &dst[i] );
// fused multiply-add: src0 * src1 + dst
1161 v6 = vec_madd( v0, v1, v4 );
1162 v7 = vec_madd( v2, v3, v5 );
1164 ALIGNED_STORE2( &dst[i], v6, v7 );
// scalar tail
1168 for ( ; i < count ; i++ ) {
1169 dst[i] += src0[i] * src1[i];
1175 idSIMD_AltiVec::MulSub
1177 dst[i] -= constant * src[i];
// idSIMD_AltiVec::MulSub — dst[i] -= constant * src[i] for count floats.
// vec_nmsub(a, b, c) computes -(a*b - c) = c - a*b, which is exactly
// dst - src*constant, so no separate negate is needed.
1180 void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
1181 register vector float v0, v1, v2, v3, v4, v5;
1182 register vector float constVec;
1184 register vector float v0_low, v0_hi, v2_low, v2_hi;
1186 register vector unsigned char permVec1;
1187 vector unsigned char oneCharVector = (vector unsigned char)(1);
1190 //handle unaligned at start
1191 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1192 dst[i] -= constant * src[i];
1195 //splat constant into a vector
1196 constVec = loadSplatUnalignedScalar( &constant );
1198 //calculate permute and do loads
1199 permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1200 v2_hi = vec_ld( 0, &src[i] );
1203 for ( ; i+7 < count; i += 8 ) {
// NOTE(review): the v0_low/v2_low carry-over assignments do not appear in
// this view — confirm against the full file.
1205 v0_hi = vec_ld( 15, &src[i] );
1207 v2_hi = vec_ld( 31, &src[i] );
1209 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1210 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1212 //we know dst will be aligned here because we already handled the preceeding
1214 v1 = vec_ld( 0, &dst[i] );
1215 v3 = vec_ld( 16, &dst[i] );
1217 v4 = vec_nmsub( v0, constVec, v1 );
1218 v5 = vec_nmsub( v2, constVec, v3 );
1220 ALIGNED_STORE2( &dst[i], v4, v5 );
// scalar tail
1224 for ( ; i < count ; i++ ) {
1225 dst[i] -= constant * src[i];
1231 idSIMD_AltiVec::MulSub
1233 dst[i] -= src0[i] * src1[i];
// idSIMD_AltiVec::MulSub — dst[i] -= src0[i] * src1[i] for count floats.
// Same structure as the two-source MulAdd, with vec_nmsub giving
// dst - src0*src1 directly.
1236 void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
1237 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1239 register vector float v0_low, v0_hi, v2_low, v2_hi;
1241 register vector float v1_low, v1_hi, v3_low, v3_hi;
1243 register vector unsigned char permVec1, permVec2;
1244 vector unsigned char oneCharVector = (vector unsigned char)(1);
1247 //unaligned at start
1248 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1249 dst[i] -= src0[i] * src1[i];
1252 //calculate permute and do loads
1253 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1254 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1255 v2_hi = vec_ld( 0, &src0[i] );
1256 v3_hi = vec_ld( 0, &src1[i] );
1260 for ( ; i+7 < count; i += 8 ) {
// NOTE(review): the *_low carry-over assignments do not appear in this
// view — confirm against the full file.
1263 v0_hi = vec_ld( 15, &src0[i] );
1265 v2_hi = vec_ld( 31, &src0[i] );
1268 v1_hi = vec_ld( 15, &src1[i] );
1270 v3_hi = vec_ld( 31, &src1[i] );
1272 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1273 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1274 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1275 v3 = vec_perm( v3_low, v3_hi, permVec2 );
1277 //we know dst is aligned because we handled unaligned cases
1279 v4 = vec_ld( 0, &dst[i] );
1280 v5 = vec_ld( 16, &dst[i] );
1282 v6 = vec_nmsub( v0, v1, v4 );
1283 v7 = vec_nmsub( v2, v3, v5 );
1285 ALIGNED_STORE2( &dst[i], v6, v7 );
// scalar tail
1289 for ( ; i < count ; i++ ) {
1290 dst[i] -= src0[i] * src1[i];
1294 #endif /* ENABLE_SIMPLE_MATH */
1301 dst[i] = constant * src[i];
1304 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
1306 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1307 register vector float vecX, vecY, vecZ;
1308 vector float vecX2, vecY2, vecZ2;
1309 const float *addr = src[0].ToFloatPtr();
1312 register vector float zeroVector = (vector float)(0.0);
1313 register vector float vecConstX, vecConstY, vecConstZ;
1316 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1317 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1319 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1320 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1322 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1323 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
1327 // for scalar cleanup, if necessary
1328 constVal[0] = constant[0];
1329 constVal[1] = constant[1];
1330 constVal[2] = constant[2];
1333 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1334 vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1335 vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
1336 vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1339 // populate const vectors
1340 vecConstX = vec_splat( vecLd1, 0 );
1341 vecConstY = vec_splat( vecLd1, 1 );
1342 vecConstZ = vec_splat( vecLd1, 2 );
1344 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1345 vector float vecOld = vec_ld( 0, addr );
1347 // handle unaligned case at beginning
1348 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1349 dst[i] = constant * src[i];
1352 for ( ; i + 7 < count; i += 8 ) {
1353 float *vecPtr = (float*)( addr + (i*3) );
1354 vector float v0, v1, v2, v3, v4, v5;
1356 v0 = vecOld; //vec_ld( 0, vecPtr );
1357 v1 = vec_ld( 15, vecPtr );
1358 v2 = vec_ld( 31, vecPtr );
1359 v3 = vec_ld( 47, vecPtr );
1360 v4 = vec_ld( 63, vecPtr );
1361 v5 = vec_ld( 79, vecPtr );
1362 vecOld = vec_ld( 95, vecPtr );
1364 vecLd1 = vec_perm( v0, v1, permVec );
1365 vecLd2 = vec_perm( v1, v2, permVec );
1366 vecLd3 = vec_perm( v2, v3, permVec );
1368 vecLd4 = vec_perm( v3, v4, permVec );
1369 vecLd5 = vec_perm( v4, v5, permVec );
1370 vecLd6 = vec_perm( v5, vecOld, permVec );
1372 // permute into X Y Z vectors
1373 vecX = vec_perm( vecLd1, vecLd2, permX1 );
1374 vecY = vec_perm( vecLd1, vecLd2, permY1 );
1375 vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1376 vecX = vec_perm( vecX, vecLd3, permX2 );
1377 vecY = vec_perm( vecY, vecLd3, permY2 );
1378 vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1380 vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1381 vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1382 vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1383 vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1384 vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1385 vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1388 vecX = vec_madd( vecX, vecConstX, zeroVector );
1389 vecY = vec_madd( vecY, vecConstY, vecX );
1390 vecZ = vec_madd( vecZ, vecConstZ, vecY );
1392 vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1393 vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1394 vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1396 // store out results
1397 ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1401 for ( ; i < count; i++ ) {
1402 // look up whats at the address we want, cast it as float pointer, then
1403 // dereference that pointer
1404 tempVal[0] = *( addr + (i*3) + 0 );
1405 tempVal[1] = *( addr + (i*3) + 1 );
1406 tempVal[2] = *( addr + (i*3) + 2 );
1407 dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
1416 dst[i] = constant * src[i].Normal() + src[i][3];
1419 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
1420 //#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
1422 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1430 vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1431 vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1432 vector float vecX, vecY, vecZ, vecI3;
1433 vector float vecX2, vecY2, vecZ2, vecI32;
1434 vector float vecConstX, vecConstY, vecConstZ;
1436 constVal[0] = constant[0];
1437 constVal[1] = constant[1];
1438 constVal[2] = constant[2];
1441 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1442 vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1443 vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
1444 vector float vecConst = vec_perm( v0, v1, constPerm );
1446 vecConstX = vec_splat( vecConst, 0 );
1447 vecConstY = vec_splat( vecConst, 1 );
1448 vecConstZ = vec_splat( vecConst, 2 );
1450 // handle unaligned case at beginning
1451 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1452 dst[i] = constant * src[i].Normal() + src[i][3];
1455 const float *addr = src[i].ToFloatPtr();
1456 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1457 vector float vecOld = vec_ld( 0, addr );
1459 for ( ; i + 7 < count; i += 8 ) {
1460 float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
1461 vector float v0, v1, v2, v3, v4, v5, v6, v7;
1463 v0 = vecOld; //vec_ld( 0, planePtr );
1464 v1 = vec_ld( 15, planePtr );
1465 v2 = vec_ld( 31, planePtr );
1466 v3 = vec_ld( 47, planePtr );
1467 v4 = vec_ld( 63, planePtr );
1468 v5 = vec_ld( 79, planePtr );
1469 v6 = vec_ld( 95, planePtr );
1470 v7 = vec_ld( 111, planePtr );
1471 vecOld = vec_ld( 127, planePtr );
1473 vecPlaneLd1 = vec_perm( v0, v1, permVec );
1474 vecPlaneLd2 = vec_perm( v1, v2, permVec );
1475 vecPlaneLd3 = vec_perm( v2, v3, permVec );
1476 vecPlaneLd4 = vec_perm( v3, v4, permVec );
1478 vecPlaneLd5 = vec_perm( v4, v5, permVec );
1479 vecPlaneLd6 = vec_perm( v5, v6, permVec );
1480 vecPlaneLd7 = vec_perm( v6, v7, permVec );
1481 vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1483 // permute into X Y Z vectors, since this is square its basically
1484 // a matrix transpose
1485 v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1486 v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1487 v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1488 v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1490 vecX = vec_mergeh( v0, v1 );
1491 vecY = vec_mergel( v0, v1 );
1492 vecZ = vec_mergeh( v2, v3 );
1493 vecI3 = vec_mergel( v2, v3 );
1495 v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1496 v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1497 v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1498 v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1500 vecX2 = vec_mergeh( v4, v5 );
1501 vecY2 = vec_mergel( v4, v5 );
1502 vecZ2 = vec_mergeh( v6, v7 );
1503 vecI32 = vec_mergel( v6, v7 );
1506 v6 = vec_madd( vecZ, vecConstZ, vecI3 );
1507 v5 = vec_madd( vecY, vecConstY, v6 );
1508 v4 = vec_madd( vecX, vecConstX, v5 );
1510 v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
1511 v1 = vec_madd( vecY2, vecConstY, v0 );
1512 v2 = vec_madd( vecX2, vecConstX, v1 );
1515 ALIGNED_STORE2( &dst[i], v4, v2 );
1519 for ( ; i < count; i++ ) {
1520 // populate srcVal with src X Y Z
1521 srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
1522 srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
1523 srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
1525 // put src[i][3] into srcI3
1526 srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
1528 tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
1529 dst[i] = tempVal + srcI3;
1533 #ifndef DRAWVERT_PADDED
1538 dst[i] = constant * src[i].xyz;
// idSIMD_AltiVec::Dot — dst[i] = constant * src[i].xyz for count idDrawVerts
// (unpadded 60-byte layout). Four verts are gathered per iteration, each
// with its own realignment permute; since 4 * 60 = 240 bytes is a multiple
// of 16, the four masks stay valid for every subsequent group of four.
1541 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1542 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1544 // idDrawVert size is 60 bytes
1545 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1547 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1549 register vector float vecConstX, vecConstY, vecConstZ;
1550 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1551 register vector float zeroVector = (vector float)(0.0);
1552 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// load the (possibly unaligned) constant; offset 11 reaches the last byte
// of the 12-byte idVec3
1554 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1555 v0 = vec_ld( 0, constant.ToFloatPtr() );
1556 v1 = vec_ld( 11, constant.ToFloatPtr() );
1557 v0 = vec_perm( v0, v1, constPerm );
1559 // permute into constant vectors
1560 vecConstX = vec_splat( v0, 0 );
1561 vecConstY = vec_splat( v0, 1 );
1562 vecConstZ = vec_splat( v0, 2 );
1564 // handle unaligned case at beginning
1565 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1566 dst[i] = constant * src[i].xyz;
1569 // every fourth one will have the same alignment. Make sure we've got enough here
1570 if ( i+3 < count ) {
1571 vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1572 vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1573 vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1574 vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1577 for ( ; i+3 < count; i += 4 ) {
1578 const float *vertPtr = src[i].xyz.ToFloatPtr();
1579 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1580 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1581 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// two loads per vert cover the 12-byte xyz regardless of its alignment
1583 v0 = vec_ld( 0, vertPtr );
1584 v1 = vec_ld( 11, vertPtr );
1585 v2 = vec_ld( 0, vertPtr2 );
1586 v3 = vec_ld( 11, vertPtr2 );
1587 v4 = vec_ld( 0, vertPtr3 );
1588 v5 = vec_ld( 11, vertPtr3 );
1589 v6 = vec_ld( 0, vertPtr4 );
1590 v7 = vec_ld( 11, vertPtr4 );
1592 v0 = vec_perm( v0, v1, vertPerm1 );
1593 v2 = vec_perm( v2, v3, vertPerm2 );
1594 v4 = vec_perm( v4, v5, vertPerm3 );
1595 v6 = vec_perm( v6, v7, vertPerm4 );
1597 // transpose into X Y Z vectors
1598 v1 = vec_mergeh( v0, v4 );
1599 v3 = vec_mergeh( v2, v6 );
1600 v5 = vec_mergel( v0, v4 );
1601 v7 = vec_mergel( v2, v6 );
1603 vecSrcX1 = vec_mergeh( v1, v3 );
1604 vecSrcY1 = vec_mergel( v1, v3 );
1605 vecSrcZ1 = vec_mergeh( v5, v7 );
1607 // now calculate dot product
1608 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1609 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1610 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// aligned store: dst was aligned by the head loop and i moves by 4 floats
1613 vec_st( vecSrcZ1, 0, &dst[i] );
// scalar tail
1616 for ( ; i < count; i++ ) {
1617 dst[i] = constant * src[i].xyz;
1625 dst[i] = constant * src[i].xyz;
// idSIMD_AltiVec::Dot — dst[i] = constant * src[i].xyz for count idDrawVerts
// (DRAWVERT_PADDED: 64-byte layout). The padded stride is a multiple of 16,
// so each xyz is loaded with a single vec_ld and no realignment permute.
// NOTE(review): this assumes src[0].xyz is itself 16-byte aligned — vec_ld
// truncates the address — confirm the allocator guarantees that.
1628 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1629 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1631 // idDrawVert size is 64 bytes
1632 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1634 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1636 register vector float vecConstX, vecConstY, vecConstZ;
1637 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1638 register vector float zeroVector = (vector float)(0.0);
// NOTE(review): vertPerm1..4 are unused in this padded path
1639 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1641 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1642 v0 = vec_ld( 0, constant.ToFloatPtr() );
1643 v1 = vec_ld( 11, constant.ToFloatPtr() );
1644 v0 = vec_perm( v0, v1, constPerm );
1646 // permute into constant vectors
1647 vecConstX = vec_splat( v0, 0 );
1648 vecConstY = vec_splat( v0, 1 );
1649 vecConstZ = vec_splat( v0, 2 );
1651 // handle unaligned case at beginning
1652 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1653 dst[i] = constant * src[i].xyz;
1656 for ( ; i+3 < count; i += 4 ) {
1657 const float *vertPtr = src[i].xyz.ToFloatPtr();
1658 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1659 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1660 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1662 v0 = vec_ld( 0, vertPtr );
1663 v2 = vec_ld( 0, vertPtr2 );
1664 v4 = vec_ld( 0, vertPtr3 );
1665 v6 = vec_ld( 0, vertPtr4 );
1667 // transpose into X Y Z vectors
1668 v1 = vec_mergeh( v0, v4 );
1669 v3 = vec_mergeh( v2, v6 );
1670 v5 = vec_mergel( v0, v4 );
1671 v7 = vec_mergel( v2, v6 );
1673 vecSrcX1 = vec_mergeh( v1, v3 );
1674 vecSrcY1 = vec_mergel( v1, v3 );
1675 vecSrcZ1 = vec_mergeh( v5, v7 );
1677 // now calculate dot product
1678 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1679 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1680 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// aligned store: dst was aligned by the head loop and i moves by 4 floats
1683 vec_st( vecSrcZ1, 0, &dst[i] );
// scalar tail
1686 for ( ; i < count; i++ ) {
1687 dst[i] = constant * src[i].xyz;
1691 #endif /* DRAWVERT_PADDED */
1697 dst[i] = constant.Normal() * src[i] + constant[3];
1700 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
1701 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
1703 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1704 register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
1705 register vector float zeroVector = (vector float)(0.0);
1706 register vector float vecConstX, vecConstY, vecConstZ;
1707 register vector float vecConst3;
1709 idVec3 constNormal = constant.Normal();
1710 float const3 = constant[3];
1713 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1714 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1716 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1717 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1719 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1720 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
1724 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1725 vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1726 vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
1727 vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1729 // populate const vec
1730 vecConstX = vec_splat( vecLd1, 0 );
1731 vecConstY = vec_splat( vecLd1, 1 );
1732 vecConstZ = vec_splat( vecLd1, 2 );
1734 // put constant to add in vector
1735 vecConst3 = loadSplatUnalignedScalar( &const3 );
1737 // handle unaligned case at beginning
1738 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1739 dst[i] = constant.Normal() * src[i] + constant[3];
1742 const float *addr = src[i].ToFloatPtr();
1743 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1744 vector float vecOld = vec_ld( 0, addr );
1746 for ( ; i+7 < count; i += 8 ) {
1747 float *vecPtr = (float*)( addr + (i*3) );
1748 vector float v0, v1, v2, v3, v4, v5;
1750 v0 = vecOld; //vec_ld( 0, vecPtr );
1751 v1 = vec_ld( 15, vecPtr );
1752 v2 = vec_ld( 31, vecPtr );
1753 v3 = vec_ld( 47, vecPtr );
1754 v4 = vec_ld( 63, vecPtr );
1755 v5 = vec_ld( 79, vecPtr );
1756 vecOld = vec_ld( 95, vecPtr );
1758 vecLd1 = vec_perm( v0, v1, permVec );
1759 vecLd2 = vec_perm( v1, v2, permVec );
1760 vecLd3 = vec_perm( v2, v3, permVec );
1762 vecLd4 = vec_perm( v3, v4, permVec );
1763 vecLd5 = vec_perm( v4, v5, permVec );
1764 vecLd6 = vec_perm( v5, vecOld, permVec );
1766 // permute into X Y Z vectors
1767 vecX = vec_perm( vecLd1, vecLd2, permX1 );
1768 vecY = vec_perm( vecLd1, vecLd2, permY1 );
1769 vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1770 vecX = vec_perm( vecX, vecLd3, permX2 );
1771 vecY = vec_perm( vecY, vecLd3, permY2 );
1772 vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1774 vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1775 vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1776 vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1777 vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1778 vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1779 vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1781 // calculate dot product
1782 vecX = vec_madd( vecX, vecConstX, zeroVector );
1783 vecY = vec_madd( vecY, vecConstY, vecX );
1784 vecZ = vec_madd( vecZ, vecConstZ, vecY );
1786 vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1787 vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1788 vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1790 // add in constant[3]
1791 vecZ = vec_add( vecZ, vecConst3 );
1792 vecZ2 = vec_add( vecZ2, vecConst3 );
1794 // store out results
1795 ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1799 for ( ; i < count; i++ ) {
1800 dst[i] = constNormal * src[i] + const3;
1808 dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1811 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
1812 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
1815 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1821 const float *constPtr = constant.ToFloatPtr();
1823 register vector float vecX, vecY, vecZ, vecI3;
1824 register vector float vecX2, vecY2, vecZ2, vecI32;
1826 vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1827 vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1828 register vector float zeroVector = (vector float)(0.0);
1829 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1831 constVal[0] = *(constPtr);
1832 constVal[1] = *(constPtr+1);
1833 constVal[2] = *(constPtr+2);
1834 constVal[3] = *(constPtr+3);
1836 // populate const vector
1837 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1838 vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1839 vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
1840 vector float vecConst = vec_perm( v0, v1, constPerm );
1842 vecConstX = vec_splat( vecConst, 0 );
1843 vecConstY = vec_splat( vecConst, 1 );
1844 vecConstZ = vec_splat( vecConst, 2 );
1845 vecConstI3 = vec_splat( vecConst, 3 );
1847 // handle unaligned case at beginning
1848 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1849 dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1852 const float *srcPtr = src[i].ToFloatPtr();
1853 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
1854 vector float vecOld = vec_ld( 0, srcPtr );
1856 for ( ; i+7 < count; i += 8 ) {
1857 float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
1858 vector float v0, v1, v2, v3, v4, v5, v6, v7;
1860 v0 = vecOld; // vec_ld( 0, planePtr );
1861 v1 = vec_ld( 15, planePtr );
1862 v2 = vec_ld( 31, planePtr );
1863 v3 = vec_ld( 47, planePtr );
1864 v4 = vec_ld( 63, planePtr );
1865 v5 = vec_ld( 79, planePtr );
1866 v6 = vec_ld( 95, planePtr );
1867 v7 = vec_ld( 111, planePtr );
1868 vecOld = vec_ld( 127, planePtr );
1870 vecPlaneLd1 = vec_perm( v0, v1, permVec );
1871 vecPlaneLd2 = vec_perm( v1, v2, permVec );
1872 vecPlaneLd3 = vec_perm( v2, v3, permVec );
1873 vecPlaneLd4 = vec_perm( v3, v4, permVec );
1875 vecPlaneLd5 = vec_perm( v4, v5, permVec );
1876 vecPlaneLd6 = vec_perm( v5, v6, permVec );
1877 vecPlaneLd7 = vec_perm( v6, v7, permVec );
1878 vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1880 // permute into X Y Z vectors, since this is square its basically
1881 // a matrix transpose
1882 v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1883 v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1884 v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1885 v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1887 vecX = vec_mergeh( v0, v1 );
1888 vecY = vec_mergel( v0, v1 );
1889 vecZ = vec_mergeh( v2, v3 );
1890 vecI3 = vec_mergel( v2, v3 );
1892 v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1893 v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1894 v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1895 v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1897 vecX2 = vec_mergeh( v4, v5 );
1898 vecY2 = vec_mergel( v4, v5 );
1899 vecZ2 = vec_mergeh( v6, v7 );
1900 vecI32 = vec_mergel( v6, v7 );
1903 v4 = vec_madd( vecConstX, vecX, zeroVector );
1904 v5 = vec_madd( vecConstY, vecY, v4 );
1905 v6 = vec_madd( vecConstZ, vecZ, v5 );
1906 v7 = vec_madd( vecConstI3, vecI3, v6 );
1908 v0 = vec_madd( vecConstX, vecX2, zeroVector );
1909 v1 = vec_madd( vecConstY, vecY2, v0 );
1910 v2 = vec_madd( vecConstZ, vecZ2, v1 );
1911 v3 = vec_madd( vecConstI3, vecI32, v2 );
1914 ALIGNED_STORE2( &dst[i], v7, v3 );
1918 for ( ; i < count; i++ ) {
1919 //dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1920 srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
1921 srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
1922 srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
1923 srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
1924 dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
1929 #ifndef DRAWVERT_PADDED
1934 dst[i] = constant.Normal() * src[i].xyz + constant[3];
// idSIMD_AltiVec::Dot — dst[i] = constant.Normal() * src[i].xyz + constant[3]
// for count idDrawVerts (unpadded 60-byte layout). Four verts per iteration,
// each with its own realignment permute; 4 * 60 = 240 bytes is a multiple of
// 16, so the four masks stay valid for every subsequent group of four.
1937 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
1938 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
1940 // idDrawVert size is 60 bytes
1941 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1944 const float *constPtr = constant.ToFloatPtr();
1945 const float *srcPtr = src[0].xyz.ToFloatPtr();
1947 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1948 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1949 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1950 register vector float vecDest1;
1951 register vector float zeroVector = (vector float)(0.0);
1952 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// scalar copies of the plane for the head/tail loops
1957 constVal[0] = *(constPtr+0);
1958 constVal[1] = *(constPtr+1);
1959 constVal[2] = *(constPtr+2);
1960 constVal[3] = *(constPtr+3);
1962 // populate const vec
1963 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1964 v0 = vec_ld( 0, constant.ToFloatPtr() );
1965 v1 = vec_ld( 15, constant.ToFloatPtr() );
1966 v0 = vec_perm( v0, v1, constPerm );
1968 vecConstX = vec_splat( v0, 0 );
1969 vecConstY = vec_splat( v0, 1 );
1970 vecConstZ = vec_splat( v0, 2 );
1971 vecConstI3 = vec_splat( v0, 3 );
1973 // handle unaligned case at beginning
1974 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1975 dst[i] = constant.Normal() * src[i].xyz + constant[3];
1978 // every fourth one will have the same alignment, so can store these. Make sure we
1979 // have enough so we don't run off the end of the array
1980 if ( i+3 < count ) {
1981 vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1982 vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1983 vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1984 vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1987 for ( ; i+3 < count; i+=4 ) {
1988 const float *vertPtr = src[i].xyz.ToFloatPtr();
1989 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1990 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1991 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// two loads per vert cover the 12-byte xyz regardless of its alignment
1993 v0 = vec_ld( 0, vertPtr );
1994 v1 = vec_ld( 11, vertPtr );
1995 v2 = vec_ld( 0, vertPtr2 );
1996 v3 = vec_ld( 11, vertPtr2 );
1997 v4 = vec_ld( 0, vertPtr3 );
1998 v5 = vec_ld( 11, vertPtr3 );
1999 v6 = vec_ld( 0, vertPtr4 );
2000 v7 = vec_ld( 11, vertPtr4 );
2002 v0 = vec_perm( v0, v1, vertPerm1 );
2003 v2 = vec_perm( v2, v3, vertPerm2 );
2004 v4 = vec_perm( v4, v5, vertPerm3 );
2005 v6 = vec_perm( v6, v7, vertPerm4 );
2007 // transpose into X Y Z vectors
2008 v1 = vec_mergeh( v0, v4 );
2009 v3 = vec_mergeh( v2, v6 );
2010 v5 = vec_mergel( v0, v4 );
2011 v7 = vec_mergel( v2, v6 );
2013 vecSrcX1 = vec_mergeh( v1, v3 );
2014 vecSrcY1 = vec_mergel( v1, v3 );
2015 vecSrcZ1 = vec_mergeh( v5, v7 );
2017 // now calculate dot product
2018 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2019 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2020 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2021 vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
// aligned store: dst was aligned by the head loop and i moves by 4 floats
2024 vec_st( vecDest1, 0, &dst[i] );
// scalar tail indexes srcPtr (the base of src[0].xyz) by the 15-float
// drawvert stride
2028 for ( ; i < count; i++ ) {
2029 srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2030 srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2031 srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2032 // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2034 dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2035 dst[i] += constVal[3];
2043 dst[i] = constant.Normal() * src[i].xyz + constant[3];
// idSIMD_AltiVec::Dot — dst[i] = constant.Normal() * src[i].xyz + constant[3]
// for count idDrawVerts (DRAWVERT_PADDED: 16-byte-multiple stride, so each
// xyz is fetched with a single vec_ld and no realignment permute).
// NOTE(review): this assumes src[0].xyz is itself 16-byte aligned — vec_ld
// truncates the address — confirm the allocator guarantees that.
2046 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
2047 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
2049 // idDrawVert size is 60 bytes
2050 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
2053 const float *constPtr = constant.ToFloatPtr();
2054 const float *srcPtr = src[0].xyz.ToFloatPtr();
2056 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
2057 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
2058 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
2059 register vector float vecDest1;
2060 register vector float zeroVector = (vector float)(0.0);
// NOTE(review): vertPerm1..4 are unused in this padded path
2061 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// scalar copies of the plane for the head/tail loops
2066 constVal[0] = *(constPtr+0);
2067 constVal[1] = *(constPtr+1);
2068 constVal[2] = *(constPtr+2);
2069 constVal[3] = *(constPtr+3);
2071 // populate const vec
2072 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
2073 v0 = vec_ld( 0, constant.ToFloatPtr() );
2074 v1 = vec_ld( 15, constant.ToFloatPtr() );
2075 v0 = vec_perm( v0, v1, constPerm );
2077 vecConstX = vec_splat( v0, 0 );
2078 vecConstY = vec_splat( v0, 1 );
2079 vecConstZ = vec_splat( v0, 2 );
2080 vecConstI3 = vec_splat( v0, 3 );
2082 // handle unaligned case at beginning
2083 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2084 dst[i] = constant.Normal() * src[i].xyz + constant[3];
2087 for ( ; i+3 < count; i+=4 ) {
2088 const float *vertPtr = src[i].xyz.ToFloatPtr();
2089 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
2090 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
2091 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
2093 v0 = vec_ld( 0, vertPtr );
2094 v2 = vec_ld( 0, vertPtr2 );
2095 v4 = vec_ld( 0, vertPtr3 );
2096 v6 = vec_ld( 0, vertPtr4 );
2098 // transpose into X Y Z vectors
2099 v1 = vec_mergeh( v0, v4 );
2100 v3 = vec_mergeh( v2, v6 );
2101 v5 = vec_mergel( v0, v4 );
2102 v7 = vec_mergel( v2, v6 );
2104 vecSrcX1 = vec_mergeh( v1, v3 );
2105 vecSrcY1 = vec_mergel( v1, v3 );
2106 vecSrcZ1 = vec_mergeh( v5, v7 );
2108 // now calculate dot product
2109 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2110 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2111 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2112 vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
// aligned store: dst was aligned by the head loop and i moves by 4 floats
2115 vec_st( vecDest1, 0, &dst[i] );
// scalar tail
2119 for ( ; i < count; i++ ) {
2120 srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2121 srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2122 srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2123 // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2125 dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2126 dst[i] += constVal[3];
2130 #endif /* DRAWVERT_PADDED */
2136 dst[i] = src0[i] * src1[i];
/*
============
idSIMD_AltiVec::Dot

	dst[i] = src0[i] * src1[i];

	Element-wise dot product of two idVec3 arrays (idVec3::operator* is a
	dot product).  Vector path handles 8 idVec3 pairs per iteration:
	six overlapping unaligned loads per source are realigned with vec_perm,
	then a two-stage permute network separates X/Y/Z lanes before the madd
	chain.  Head (until dst is 16-byte aligned) and tail run scalar code.
============
*/
2139 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
2140 //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
2146 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
2147 vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
2148 register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
2149 register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
2150 register vector float zeroVector = (vector float)(0.0);
// permutes that gather every 3rd float (packed 12-byte idVec3s) into lanes
2152 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
2153 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2154 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
2155 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2156 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
2157 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2159 // handle unaligned case at beginning
2160 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2161 dst[i] = src0[i] * src1[i];
// base pointers taken AFTER the alignment loop advanced i
2164 const float *src0Ptr = src0[i].ToFloatPtr();
2165 const float *src1Ptr = src1[i].ToFloatPtr();
2166 vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
2167 vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
2168 vector float vecOld0 = vec_ld( 0, src0Ptr );
2169 vector float vecOld1 = vec_ld( 0, src1Ptr );
// NOTE(review): i is reset to 0 here while src0Ptr/src1Ptr are based at the
// post-alignment index, yet stores still go to dst[i] - if the alignment
// loop executed at all, sources and destination index from different bases.
// Verify against the full file whether dst is guaranteed aligned on entry.
2171 for ( i = 0; i+7 < count; i += 8 ) {
2172 float *s0Ptr = (float*)( src0Ptr + (i*3) );
2173 float *s1Ptr = (float*)( src1Ptr + (i*3) );
2175 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
// 8 idVec3 = 96 bytes per source: six overlapping 16-byte loads, the last
// one also seeds vecOld0/vecOld1 for the next iteration (rolling pipeline)
2177 v1 = vec_ld( 15, s0Ptr );
2178 v2 = vec_ld( 31, s0Ptr );
2179 v3 = vec_ld( 47, s0Ptr );
2180 v4 = vec_ld( 63, s0Ptr );
2181 v5 = vec_ld( 79, s0Ptr );
2182 vecOld0 = vec_ld( 95, s0Ptr );
2185 v7 = vec_ld( 15, s1Ptr );
2186 v8 = vec_ld( 31, s1Ptr );
2187 v9 = vec_ld( 47, s1Ptr );
2188 v10 = vec_ld( 63, s1Ptr );
2189 v11 = vec_ld( 79, s1Ptr );
2190 vecOld1 = vec_ld( 95, s1Ptr );
// realign each pair of raw loads into contiguous float data
2192 vecLd1 = vec_perm( v0, v1, permVec1 );
2193 vecLd2 = vec_perm( v1, v2, permVec1 );
2194 vecLd3 = vec_perm( v2, v3, permVec1 );
2195 vecLd4 = vec_perm( v3, v4, permVec1 );
2196 vecLd5 = vec_perm( v4, v5, permVec1 );
2197 vecLd6 = vec_perm( v5, vecOld0, permVec1 );
2199 vecLd7 = vec_perm( v6, v7, permVec2 );
2200 vecLd8 = vec_perm( v7, v8, permVec2 );
2201 vecLd9 = vec_perm( v8, v9, permVec2 );
2202 vecLd10 = vec_perm( v9, v10, permVec2 );
2203 vecLd11 = vec_perm( v10, v11, permVec2 );
2204 vecLd12 = vec_perm( v11, vecOld1, permVec2 );
2206 // permute into X Y Z vectors
// two-stage gather: stage 1 pulls lanes from the first two loads,
// stage 2 merges the remaining lane from the third load
2207 vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
2208 vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
2209 vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
2210 vecX0 = vec_perm( vecX0, vecLd3, permX2 );
2211 vecY0 = vec_perm( vecY0, vecLd3, permY2 );
2212 vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
2214 vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
2215 vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
2216 vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
2217 vecX02 = vec_perm( vecX02, vecLd6, permX2 );
2218 vecY02 = vec_perm( vecY02, vecLd6, permY2 );
2219 vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
2221 vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
2222 vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
2223 vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
2224 vecX1 = vec_perm( vecX1, vecLd9, permX2 );
2225 vecY1 = vec_perm( vecY1, vecLd9, permY2 );
2226 vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
2228 vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
2229 vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
2230 vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
2231 vecX12 = vec_perm( vecX12, vecLd12, permX2 );
2232 vecY12 = vec_perm( vecY12, vecLd12, permY2 );
2233 vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
// dot product via madd chain: x0*x1 + y0*y1 + z0*z1, 4 results per chain
2236 vecX0 = vec_madd( vecX0, vecX1, zeroVector );
2237 vecY0 = vec_madd( vecY0, vecY1, vecX0 );
2238 vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
2239 vecX02 = vec_madd( vecX02, vecX12, zeroVector );
2240 vecY02 = vec_madd( vecY02, vecY12, vecX02 );
2241 vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
2243 // store out results
2244 ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
// scalar cleanup for the last (count % 8) pairs
2248 for ( ; i < count; i++ ) {
2249 // dst[i] = src0[i] * src1[i];
2250 src0Val[0] = *( src0Ptr + (i*3) + 0 );
2251 src0Val[1] = *( src0Ptr + (i*3) + 1 );
2252 src0Val[2] = *( src0Ptr + (i*3) + 2 );
2254 src1Val[0] = *( src1Ptr + (i*3) + 0 );
2255 src1Val[1] = *( src1Ptr + (i*3) + 1 );
2256 src1Val[2] = *( src1Ptr + (i*3) + 2 );
2258 dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
2266 dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
/*
============
idSIMD_AltiVec::Dot

	dot = src1[0] * src2[0] + src1[1] * src2[1] + ... + src1[count-1] * src2[count-1]

	Scalar dot product of two float arrays reduced into a single float.
	Vector path consumes 8 floats per iteration into two running-sum
	vectors, which are combined and summed across lanes at the end.
	NOTE(review): float addition is reordered relative to the scalar
	version, so results may differ in the last bits - presumably accepted.
============
*/
2269 void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
2272 register vector float v0, v1, v2, v3;
2273 register vector float zeroVector;
2274 register vector float runningTotal1, runningTotal2;
// low/hi pairs implement the rolling unaligned-load pipeline
2276 register vector float v0_low, v0_hi, v2_low, v2_hi;
2278 register vector float v1_low, v1_hi, v3_low, v3_hi;
2280 register vector unsigned char permVec1, permVec2;
2281 vector unsigned char oneCharVector = (vector unsigned char)(1);
2285 runningTotal1 = (vector float)(0.0);
2286 runningTotal2 = (vector float)(0.0);
2287 zeroVector = (vector float)(0.0);
2290 //calculate permute and do loads
// lvsl(-1)+1 trick yields the realignment permute for unaligned sources
2291 permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
2292 permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
// prime the pipeline with the first 16-byte block of each source
2293 v2_hi = vec_ld( 0, &src1[i] );
2294 v3_hi = vec_ld( 0, &src2[i] );
// 8 floats per iteration; *_low values are presumably forwarded from the
// previous iteration's *_hi loads (forwarding assignments elided here)
2297 for ( ; i+7 < count; i += 8 ) {
2300 v0_hi = vec_ld( 15, &src1[i] );
2302 v2_hi = vec_ld( 31, &src1[i] );
2305 v1_hi = vec_ld( 15, &src2[i] );
2307 v3_hi = vec_ld( 31, &src2[i] );
2309 v0 = vec_perm( v0_low, v0_hi, permVec1 );
2310 v1 = vec_perm( v1_low, v1_hi, permVec2 );
2311 v2 = vec_perm( v2_low, v2_hi, permVec1 );
2312 v3 = vec_perm( v3_low, v3_hi, permVec2 );
2314 //multiply together and keep running sum
2315 runningTotal1 = vec_madd( v0, v1, runningTotal1 );
2316 runningTotal2 = vec_madd( v2, v3, runningTotal2 );
2319 runningTotal1 = vec_add( runningTotal1, runningTotal2 );
2321 // sum accross vector
// log2 reduction: shift-by-8-bytes then shift-by-4-bytes and add
2322 v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
2323 v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
2324 runningTotal1 = vec_splat( v1, 0 );
// store the single-element result into the output reference
2325 vec_ste( runningTotal1, 0, &dot );
2328 //handle cleanup. when profiling the game, we found that most of the counts to this function were small, so it
2329 // spends a lot of time in this scalar code. It's already really really fast (eg 1 TB tick) for scalar code for
2330 // counts less than 50, so not much point in trying to get vector code in on the action
2331 for ( ; i < count ; i++ ) {
2332 dot += src1[i] * src2[i];
2336 #endif /* ENABLE_DOT */
2338 #ifdef ENABLE_COMPARES
2342 idSIMD_AltiVec::CmpGT
2344 dst[i] = src0[i] > constant;
/*
============
idSIMD_AltiVec::CmpGT

	dst[i] = src0[i] > constant;

	Writes 1 or 0 into each byte of dst.  Vector path compares 16 floats
	per iteration, packs the bool results int->short->char, and masks
	with 1 so true is 1 rather than 0xFF.  Unaligned head and tail are
	handled with scalar code.
============
*/
2348 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
2349 //#define OPER(X) dst[(X)] = src0[(X)] > constant;
2351 register vector float v0, v1, v2, v3;
2352 register vector bool int vr1, vr2, vr3, vr4;
2353 register vector bool short vs1, vs2;
2354 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2355 register vector unsigned char vc1;
2356 register vector bool char vbc1;
2357 register vector float constVec;
2358 register vector unsigned char oneVector = (vector unsigned char)(1);
2359 register vector unsigned char permVec;
2362 //handle unaligned at start
2363 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2364 dst[i] = src0[i] > constant;
2367 //splat constant into a vector
2368 constVec = loadSplatUnalignedScalar( &constant );
2370 //calculate permute and do loads
2371 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
// prime the rolling-load pipeline
2372 v3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 result bytes per iteration
2375 for ( ; i+15 < count; i += 16 ) {
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2378 v0_hi = vec_ld( 15, &src0[i] );
2380 v1_hi = vec_ld( 31, &src0[i] );
2382 v2_hi = vec_ld( 47, &src0[i] );
2384 v3_hi = vec_ld( 63, &src0[i] );
2386 //permute into the vectors we want
2387 v0 = vec_perm( v0_low, v0_hi, permVec );
2388 v1 = vec_perm( v1_low, v1_hi, permVec );
2389 v2 = vec_perm( v2_low, v2_hi, permVec );
2390 v3 = vec_perm( v3_low, v3_hi, permVec );
// element-wise float compare; result lanes are all-ones or all-zeros
2393 vr1 = vec_cmpgt( v0, constVec );
2394 vr2 = vec_cmpgt( v1, constVec );
2395 vr3 = vec_cmpgt( v2, constVec );
2396 vr4 = vec_cmpgt( v3, constVec );
2398 // pack results into shorts
2399 vs1 = vec_pack(vr1, vr2);
2400 vs2 = vec_pack(vr3, vr4);
2402 // pack results into byte
2403 vbc1 = vec_pack(vs1, vs2);
2405 //AND with 1 to get true=1 not true=255
2406 vc1 = vec_and( vbc1, oneVector );
// dst is 16-byte aligned here thanks to the head loop
2409 vec_st( vc1, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2413 for ( ; i < count ; i++ ) {
2414 dst[i] = src0[i] > constant;
2421 idSIMD_AltiVec::CmpGT
2423 dst[i] |= ( src0[i] > constant ) << bitNum;
/*
============
idSIMD_AltiVec::CmpGT

	dst[i] |= ( src0[i] > constant ) << bitNum;

	Bitmask variant: ORs the comparison result into bit 'bitNum' of each
	destination byte instead of overwriting it.  Same 16-at-a-time
	compare/pack pipeline as the plain CmpGT, plus a read-modify-write of
	the destination vector.
============
*/
2426 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2427 //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
2429 // Temp vector registers
2430 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2431 register vector bool short vtbs0, vtbs1;
2432 register vector bool char vtbc0;
2433 register vector unsigned char vtuc0;
2434 register vector unsigned char permVec, permVec2;
// destination bytes, loaded/merged/stored per iteration
2437 register vector unsigned char vd;
// bitNum broadcast to all 16 byte lanes for vec_sl
2439 register vector unsigned char bitNumVec;
2441 register vector float vs0, vs1, vs2, vs3;
2442 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2444 register vector float constVec;
2446 register vector unsigned char oneVector = (vector unsigned char)(1);
2449 //handle unaligned at start
2450 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2451 dst[i] |= ( src0[i] > constant ) << bitNum;
2454 //splat constant into a vector
2455 constVec = loadSplatUnalignedScalar( &constant );
2457 //bitNum is unaligned.
// unaligned-safe single-byte load of bitNum, then splat to every lane
2458 permVec2 = vec_lvsl( 0, &bitNum );
2459 vtuc0 = vec_ld( 0, &bitNum );
2460 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2461 bitNumVec = vec_splat( bitNumVec, 0 );
2463 //calculate permute and do loads
2464 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2465 vs3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 destination bytes per iteration
2468 for ( ; i+15 < count; i += 16 ) {
2469 //load sources (floats)
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2471 vs0_hi = vec_ld( 15, &src0[i] );
2473 vs1_hi = vec_ld( 31, &src0[i] );
2475 vs2_hi = vec_ld( 47, &src0[i] );
2477 vs3_hi = vec_ld( 63, &src0[i] );
2479 //permute into the vectors we want
2480 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2481 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2482 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2483 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2485 //load dest (bytes) as unsigned char
2486 vd = vec_ld( 0, &dst[i] );
2488 // do comparison and get bool int result
2489 vtbi0 = vec_cmpgt( vs0, constVec );
2490 vtbi1 = vec_cmpgt( vs1, constVec );
2491 vtbi2 = vec_cmpgt( vs2, constVec );
2492 vtbi3 = vec_cmpgt( vs3, constVec );
2494 // pack results into shorts
2495 vtbs0 = vec_pack(vtbi0, vtbi1);
2496 vtbs1 = vec_pack(vtbi2, vtbi3);
2498 // pack results into byte
2499 vtbc0 = vec_pack(vtbs0, vtbs1);
2501 //and with 1 to get true=1 instead of true=255
2502 vtuc0 = vec_and(vtbc0, oneVector);
// shift each result byte left by bitNum before merging
2503 vtuc0 = vec_sl(vtuc0, bitNumVec );
// OR into the existing destination bits (read-modify-write)
2506 vd = vec_or( vd, vtuc0 );
2508 vec_st( vd, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2512 for ( ; i < count ; i++ ) {
2513 dst[i] |= ( src0[i] > constant ) << bitNum;
2519 idSIMD_AltiVec::CmpGE
2521 dst[i] = src0[i] >= constant;
/*
============
idSIMD_AltiVec::CmpGE

	dst[i] = src0[i] >= constant;

	Writes 1 or 0 into each byte of dst.  Identical structure to CmpGT
	but uses vec_cmpge: 16 floats per iteration, bool results packed
	int->short->char and masked with 1.
============
*/
2524 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
2526 register vector float v0, v1, v2, v3;
2527 register vector bool int vr1, vr2, vr3, vr4;
2528 register vector bool short vs1, vs2;
2529 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2530 register vector unsigned char vc1;
2531 register vector bool char vbc1;
2532 register vector float constVec;
2533 register vector unsigned char oneVector = (vector unsigned char)(1);
2534 register vector unsigned char permVec;
2537 //handle unaligned at start
2538 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2539 dst[i] = src0[i] >= constant;
2542 //splat constant into a vector
2543 constVec = loadSplatUnalignedScalar( &constant );
2545 //calculate permute and do loads
2546 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
// prime the rolling-load pipeline
2547 v3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 result bytes per iteration
2550 for ( ; i+15 < count; i += 16 ) {
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2553 v0_hi = vec_ld( 15, &src0[i] );
2555 v1_hi = vec_ld( 31, &src0[i] );
2557 v2_hi = vec_ld( 47, &src0[i] );
2559 v3_hi = vec_ld( 63, &src0[i] );
2561 //permute into the vectors we want
2562 v0 = vec_perm( v0_low, v0_hi, permVec );
2563 v1 = vec_perm( v1_low, v1_hi, permVec );
2564 v2 = vec_perm( v2_low, v2_hi, permVec );
2565 v3 = vec_perm( v3_low, v3_hi, permVec );
// element-wise >= compare
2568 vr1 = vec_cmpge( v0, constVec );
2569 vr2 = vec_cmpge( v1, constVec );
2570 vr3 = vec_cmpge( v2, constVec );
2571 vr4 = vec_cmpge( v3, constVec );
2573 // pack results into shorts
2574 vs1 = vec_pack(vr1, vr2);
2575 vs2 = vec_pack(vr3, vr4);
2577 // pack results into byte
2578 vbc1 = vec_pack(vs1, vs2);
2580 //AND with 1 to get true=1 not true=255
2581 vc1 = vec_and( vbc1, oneVector );
// aligned store of 16 result bytes
2584 vec_st( vc1, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2588 for ( ; i < count ; i++ ) {
2589 dst[i] = src0[i] >= constant;
2595 idSIMD_AltiVec::CmpGE
2597 dst[i] |= ( src0[i] >= constant ) << bitNum;
/*
============
idSIMD_AltiVec::CmpGE

	dst[i] |= ( src0[i] >= constant ) << bitNum;

	Bitmask variant of CmpGE: ORs the comparison result into bit 'bitNum'
	of each destination byte.  Same compare/pack pipeline as above, plus a
	read-modify-write of the destination vector.
============
*/
2600 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2601 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2602 register vector bool short vtbs0, vtbs1;
2603 register vector bool char vtbc0;
2604 register vector unsigned char vtuc0;
2605 register vector unsigned char permVec, permVec2;
// destination bytes, loaded/merged/stored per iteration
2608 register vector unsigned char vd;
// bitNum broadcast to all 16 byte lanes for vec_sl
2610 register vector unsigned char bitNumVec;
2612 register vector float vs0, vs1, vs2, vs3;
2613 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2615 register vector float constVec;
2617 register vector unsigned char oneVector = (vector unsigned char)(1);
2620 //handle unaligned at start
2621 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2622 dst[i] |= ( src0[i] >= constant ) << bitNum;
2625 //splat constant into a vector
2626 constVec = loadSplatUnalignedScalar( &constant );
2628 //bitNum is unaligned.
// unaligned-safe single-byte load of bitNum, then splat to every lane
2629 permVec2 = vec_lvsl( 0, &bitNum );
2630 vtuc0 = vec_ld( 0, &bitNum );
2631 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2632 bitNumVec = vec_splat( bitNumVec, 0 );
2634 //calculate permute and do loads
2635 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2636 vs3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 destination bytes per iteration
2639 for ( ; i+15 < count; i += 16 ) {
2640 //load sources (floats)
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2642 vs0_hi = vec_ld( 15, &src0[i] );
2644 vs1_hi = vec_ld( 31, &src0[i] );
2646 vs2_hi = vec_ld( 47, &src0[i] );
2648 vs3_hi = vec_ld( 63, &src0[i] );
2650 //permute into the vectors we want
2651 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2652 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2653 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2654 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2656 //load dest (bytes) as unsigned char
2657 vd = vec_ld( 0, &dst[i] );
2659 // do comparison and get bool int result
2660 vtbi0 = vec_cmpge( vs0, constVec );
2661 vtbi1 = vec_cmpge( vs1, constVec );
2662 vtbi2 = vec_cmpge( vs2, constVec );
2663 vtbi3 = vec_cmpge( vs3, constVec );
2665 // pack results into shorts
2666 vtbs0 = vec_pack(vtbi0, vtbi1);
2667 vtbs1 = vec_pack(vtbi2, vtbi3);
2669 // pack results into byte
2670 vtbc0 = vec_pack(vtbs0, vtbs1);
2672 //and with 1L to get true=1 instead of true=255
2673 vtuc0 = vec_and(vtbc0, oneVector);
// shift each result byte left by bitNum before merging
2674 vtuc0 = vec_sl(vtuc0, bitNumVec );
// OR into the existing destination bits (read-modify-write)
2677 vd = vec_or( vd, vtuc0 );
2679 vec_st( vd, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2683 for ( ; i < count ; i++ ) {
2684 dst[i] |= ( src0[i] >= constant ) << bitNum;
2691 idSIMD_AltiVec::CmpLT
2693 dst[i] = src0[i] < constant;
/*
============
idSIMD_AltiVec::CmpLT

	dst[i] = src0[i] < constant;

	Writes 1 or 0 into each byte of dst.  Identical structure to CmpGT
	but uses vec_cmplt: 16 floats per iteration, bool results packed
	int->short->char and masked with 1.
============
*/
2696 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
2697 //#define OPER(X) dst[(X)] = src0[(X)] < constant;
2698 register vector float v0, v1, v2, v3;
2699 register vector bool int vr1, vr2, vr3, vr4;
2700 register vector bool short vs1, vs2;
2701 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2702 register vector unsigned char vc1;
2703 register vector bool char vbc1;
2704 register vector float constVec;
2705 register vector unsigned char oneVector = (vector unsigned char)(1);
2706 register vector unsigned char permVec;
2709 //handle unaligned at start
2710 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2711 dst[i] = src0[i] < constant;
2714 //splat constant into a vector
2715 constVec = loadSplatUnalignedScalar( &constant );
2717 //calculate permute and do loads
2718 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
// prime the rolling-load pipeline
2719 v3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 result bytes per iteration
2722 for ( ; i+15 < count; i += 16 ) {
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2725 v0_hi = vec_ld( 15, &src0[i] );
2727 v1_hi = vec_ld( 31, &src0[i] );
2729 v2_hi = vec_ld( 47, &src0[i] );
2731 v3_hi = vec_ld( 63, &src0[i] );
2733 //permute into the vectors we want
2734 v0 = vec_perm( v0_low, v0_hi, permVec );
2735 v1 = vec_perm( v1_low, v1_hi, permVec );
2736 v2 = vec_perm( v2_low, v2_hi, permVec );
2737 v3 = vec_perm( v3_low, v3_hi, permVec );
// element-wise < compare
2740 vr1 = vec_cmplt( v0, constVec );
2741 vr2 = vec_cmplt( v1, constVec );
2742 vr3 = vec_cmplt( v2, constVec );
2743 vr4 = vec_cmplt( v3, constVec );
2745 // pack results into shorts
2746 vs1 = vec_pack(vr1, vr2);
2747 vs2 = vec_pack(vr3, vr4);
2749 // pack results into byte
2750 vbc1 = vec_pack(vs1, vs2);
2752 //AND with 1 to get true=1 not true=255
2753 vc1 = vec_and( vbc1, oneVector );
// aligned store of 16 result bytes
2756 vec_st( vc1, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2760 for ( ; i < count ; i++ ) {
2761 dst[i] = src0[i] < constant;
2767 idSIMD_AltiVec::CmpLT
2769 dst[i] |= ( src0[i] < constant ) << bitNum;
/*
============
idSIMD_AltiVec::CmpLT

	dst[i] |= ( src0[i] < constant ) << bitNum;

	Bitmask variant of CmpLT: ORs the comparison result into bit 'bitNum'
	of each destination byte.  Same compare/pack pipeline as above, plus a
	read-modify-write of the destination vector.
============
*/
2772 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2773 //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
2774 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2775 register vector bool short vtbs0, vtbs1;
2776 register vector bool char vtbc0;
2777 register vector unsigned char vtuc0;
2778 register vector unsigned char permVec, permVec2;
// destination bytes, loaded/merged/stored per iteration
2781 register vector unsigned char vd;
// bitNum broadcast to all 16 byte lanes for vec_sl
2783 register vector unsigned char bitNumVec;
2785 register vector float vs0, vs1, vs2, vs3;
2786 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2788 register vector float constVec;
2790 register vector unsigned char oneVector = (vector unsigned char)(1);
2793 //handle unaligned at start
2794 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2795 dst[i] |= ( src0[i] < constant ) << bitNum;
2798 //splat constant into a vector
2799 constVec = loadSplatUnalignedScalar( &constant );
2801 //bitNum is unaligned.
// unaligned-safe single-byte load of bitNum, then splat to every lane
2802 permVec2 = vec_lvsl( 0, &bitNum );
2803 vtuc0 = vec_ld( 0, &bitNum );
2804 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2805 bitNumVec = vec_splat( bitNumVec, 0 );
2807 //calculate permute and do loads
2808 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2809 vs3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 destination bytes per iteration
2812 for ( ; i+15 < count; i += 16 ) {
2813 //load sources (floats)
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2815 vs0_hi = vec_ld( 15, &src0[i] );
2817 vs1_hi = vec_ld( 31, &src0[i] );
2819 vs2_hi = vec_ld( 47, &src0[i] );
2821 vs3_hi = vec_ld( 63, &src0[i] );
2823 //permute into the vectors we want
2824 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2825 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2826 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2827 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2829 //load dest (bytes) as unsigned char
2830 vd = vec_ld( 0, &dst[i] );
2832 // do comparison and get bool int result
2833 vtbi0 = vec_cmplt( vs0, constVec );
2834 vtbi1 = vec_cmplt( vs1, constVec );
2835 vtbi2 = vec_cmplt( vs2, constVec );
2836 vtbi3 = vec_cmplt( vs3, constVec );
2838 // pack results into shorts
2839 vtbs0 = vec_pack(vtbi0, vtbi1);
2840 vtbs1 = vec_pack(vtbi2, vtbi3);
2842 // pack results into byte
2843 vtbc0 = vec_pack(vtbs0, vtbs1);
2845 //and with 1L to get true=1 instead of true=255
2846 vtuc0 = vec_and(vtbc0, oneVector);
// shift each result byte left by bitNum before merging
2847 vtuc0 = vec_sl(vtuc0, bitNumVec );
// OR into the existing destination bits (read-modify-write)
2850 vd = vec_or( vd, vtuc0 );
2852 vec_st( vd, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2856 for ( ; i < count ; i++ ) {
2857 dst[i] |= ( src0[i] < constant ) << bitNum;
2865 idSIMD_AltiVec::CmpLE
2867 dst[i] = src0[i] <= constant;
/*
============
idSIMD_AltiVec::CmpLE

	dst[i] = src0[i] <= constant;

	Writes 1 or 0 into each byte of dst.  Identical structure to CmpGT
	but uses vec_cmple: 16 floats per iteration, bool results packed
	int->short->char and masked with 1.
============
*/
2870 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
2871 //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
2872 register vector float v0, v1, v2, v3;
2873 register vector bool int vr1, vr2, vr3, vr4;
2874 register vector bool short vs1, vs2;
2875 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2876 register vector unsigned char vc1;
2877 register vector bool char vbc1;
2878 register vector float constVec;
2879 register vector unsigned char oneVector = (vector unsigned char)(1);
2880 register vector unsigned char permVec;
2883 //handle unaligned at start
2884 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2885 dst[i] = src0[i] <= constant;
2888 //splat constant into a vector
2889 constVec = loadSplatUnalignedScalar( &constant );
2891 //calculate permute and do loads
2892 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
// prime the rolling-load pipeline
2893 v3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 result bytes per iteration
2896 for ( ; i+15 < count; i += 16 ) {
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2899 v0_hi = vec_ld( 15, &src0[i] );
2901 v1_hi = vec_ld( 31, &src0[i] );
2903 v2_hi = vec_ld( 47, &src0[i] );
2905 v3_hi = vec_ld( 63, &src0[i] );
2907 //permute into the vectors we want
2908 v0 = vec_perm( v0_low, v0_hi, permVec );
2909 v1 = vec_perm( v1_low, v1_hi, permVec );
2910 v2 = vec_perm( v2_low, v2_hi, permVec );
2911 v3 = vec_perm( v3_low, v3_hi, permVec );
// element-wise <= compare
2914 vr1 = vec_cmple( v0, constVec );
2915 vr2 = vec_cmple( v1, constVec );
2916 vr3 = vec_cmple( v2, constVec );
2917 vr4 = vec_cmple( v3, constVec );
2919 // pack results into shorts
2920 vs1 = vec_pack(vr1, vr2);
2921 vs2 = vec_pack(vr3, vr4);
2923 // pack results into byte
2924 vbc1 = vec_pack(vs1, vs2);
2926 //AND with 1 to get true=1 not true=255
2927 vc1 = vec_and( vbc1, oneVector );
// aligned store of 16 result bytes
2930 vec_st( vc1, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
2934 for ( ; i < count ; i++ ) {
2935 dst[i] = src0[i] <= constant;
2941 idSIMD_AltiVec::CmpLE
2943 dst[i] |= ( src0[i] <= constant ) << bitNum;
/*
============
idSIMD_AltiVec::CmpLE

	dst[i] |= ( src0[i] <= constant ) << bitNum;

	Bitmask variant of CmpLE: ORs the comparison result into bit 'bitNum'
	of each destination byte.  Same compare/pack pipeline as above, plus a
	read-modify-write of the destination vector.
============
*/
2946 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2947 //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
2948 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2949 register vector bool short vtbs0, vtbs1;
2950 register vector bool char vtbc0;
2951 register vector unsigned char vtuc0;
2952 register vector unsigned char permVec, permVec2;
// destination bytes, loaded/merged/stored per iteration
2955 register vector unsigned char vd;
// bitNum broadcast to all 16 byte lanes for vec_sl
2957 register vector unsigned char bitNumVec;
2959 register vector float vs0, vs1, vs2, vs3;
2960 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2962 register vector float constVec;
2964 register vector unsigned char oneVector = (vector unsigned char)(1);
2967 //handle unaligned at start
2968 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2969 dst[i] |= ( src0[i] <= constant ) << bitNum;
2972 //splat constant into a vector
2973 constVec = loadSplatUnalignedScalar( &constant );
2975 //bitNum is unaligned.
// unaligned-safe single-byte load of bitNum, then splat to every lane
2976 permVec2 = vec_lvsl( 0, &bitNum );
2977 vtuc0 = vec_ld( 0, &bitNum );
2978 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2979 bitNumVec = vec_splat( bitNumVec, 0 );
2981 //calculate permute and do loads
2982 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2983 vs3_hi = vec_ld( 0, &src0[i] );
// 16 floats -> 16 destination bytes per iteration
2986 for ( ; i+15 < count; i += 16 ) {
2987 //load sources (floats)
// NOTE(review): *_low values are presumably forwarded from the previous
// iteration's *_hi loads (forwarding assignments elided in this excerpt)
2989 vs0_hi = vec_ld( 15, &src0[i] );
2991 vs1_hi = vec_ld( 31, &src0[i] );
2993 vs2_hi = vec_ld( 47, &src0[i] );
2995 vs3_hi = vec_ld( 63, &src0[i] );
2997 //permute into the vectors we want
2998 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2999 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
3000 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
3001 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
3003 //load dest (bytes) as unsigned char
3004 vd = vec_ld( 0, &dst[i] );
3006 // do comparison and get bool int result
3007 vtbi0 = vec_cmple( vs0, constVec );
3008 vtbi1 = vec_cmple( vs1, constVec );
3009 vtbi2 = vec_cmple( vs2, constVec );
3010 vtbi3 = vec_cmple( vs3, constVec );
3012 // pack results into shorts
3013 vtbs0 = vec_pack(vtbi0, vtbi1);
3014 vtbs1 = vec_pack(vtbi2, vtbi3);
3016 // pack results into byte
3017 vtbc0 = vec_pack(vtbs0, vtbs1);
3019 //and with 1L to get true=1 instead of true=255
3020 vtuc0 = vec_and(vtbc0, oneVector);
// shift each result byte left by bitNum before merging
3021 vtuc0 = vec_sl(vtuc0, bitNumVec );
// OR into the existing destination bits (read-modify-write)
3024 vd = vec_or( vd, vtuc0 );
3026 vec_st( vd, 0, &dst[i] );
// scalar cleanup for the last (count % 16) elements
3030 for ( ; i < count ; i++ ) {
3031 dst[i] |= ( src0[i] <= constant ) << bitNum;
3034 #endif /* ENABLE_COMPARES */
3036 #ifdef ENABLE_MINMAX
3040 idSIMD_AltiVec::MinMax
/*
============
idSIMD_AltiVec::MinMax

	Finds the minimum and maximum value in a float array.
	min/max are seeded with +/-idMath::INFINITY, splatted into vectors,
	and the vector loop folds 8 elements per iteration; a shift-and-
	compare reduction collapses each vector to a single lane at the end.
	The tail runs scalar compares against the already-stored min/max.
============
*/
3043 void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
3044 min = idMath::INFINITY; max = -idMath::INFINITY;
3045 //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
3047 register vector float v0, v1, v2, v3;
3048 register vector float maxVec, minVec, tempMin, tempMax;
3049 register vector unsigned char permVec;
3050 register vector float v0_low, v0_hi, v1_low, v1_hi;
3051 vector unsigned char oneCharVector = (vector unsigned char)(1);
3056 //calculate permute and do first load to
3057 //get a starting point for min and max
3058 permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
// prime the rolling-load pipeline
3059 v1_hi = vec_ld( 0, &src[0] );
// seed the vector accumulators from the +/-INFINITY scalars above
3061 maxVec = loadSplatUnalignedScalar( &max );
3062 minVec = loadSplatUnalignedScalar( &min );
// 8 floats per iteration; *_low values are presumably forwarded from the
// previous iteration's *_hi loads (forwarding assignments elided here)
3065 for ( ; i+7 < count; i += 8 ) {
3068 v0_hi = vec_ld( 15, &src[i] );
3070 v1_hi = vec_ld( 31, &src[i] );
3071 v0 = vec_perm( v0_low, v0_hi, permVec );
3072 v1 = vec_perm( v1_low, v1_hi, permVec );
// fold both loaded vectors into the running min/max accumulators
3075 v2 = vec_min( v0, v1 );
3076 minVec = vec_min( minVec, v2 );
3078 v3 = vec_max( v0, v1 );
3079 maxVec = vec_max( maxVec, v3 );
3082 //minVec and maxVec hold the min/max elements from the array, but now
3083 //we need to figure out which particular element it is
3088 // rotate vector around and compare to itself to find the real min/max
// log2 cross-lane reduction: 8-byte then 4-byte shifted self-compare
3089 tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
3090 tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
3091 tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
3092 tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
3093 minVec = vec_splat( tempMin, 0 );
3094 maxVec = vec_splat( tempMax, 0 );
// single-element stores into the output references
3095 vec_ste( minVec, 0, &min );
3096 vec_ste( maxVec, 0, &max );
// scalar tail compares against the values already stored in min/max
3100 for ( ; i < count; i++ ) {
3101 if ( src[i] < min ) {
3104 if ( src[i] > max ) {
3112 idSIMD_AltiVec::MinMax
/*
============
idSIMD_AltiVec::MinMax

	Component-wise min/max over an array of idVec2.
	Vector path consumes 8 idVec2 (4 quadwords) per iteration; because
	idVec2 data is always laid out | X Y X Y |, the running vecMin/vecMax
	accumulators keep per-component results without any re-merge until
	the final 8-byte shifted reduction.  The tail runs scalar compares.

	BUGFIX(review): vecMax was previously seeded with FLT_MIN, which is
	the smallest POSITIVE normalized float (~1.18e-38), not the most
	negative value.  With count >= 8 and all-negative input the vector
	loop never beat that seed and the unconditional stores below wrote
	FLT_MIN into max.  Seed with -FLT_MAX instead, matching the
	-idMath::INFINITY intent of the scalar initialization.
============
*/
3115 void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
3116 min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
3117 //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
3123 const float *srcPtr = src[0].ToFloatPtr();
3124 register vector float vecLd1, vecLd2, vecLd3, vecLd4;
3125 register vector float vecMin, vecMax;
3127 register vector float v0, v1, v2, v3;
// seed accumulators with identities for min/max over finite floats
3131 vecMin = (vector float)(FLT_MAX);
3132 vecMax = (vector float)(-FLT_MAX);
// realignment permute + first load primes the rolling-load pipeline
3134 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
3135 vector float vecOld = vec_ld( 0, srcPtr );
3137 for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
// j counts quadwords: 8 idVec2 = 4 x 16 bytes per iteration
3139 float *vecPtr = (float*)( srcPtr + (j*4) );
3140 vector float v0, v1, v2, v3;
// overlapping loads; the last one also seeds vecOld for the next pass
3143 v1 = vec_ld( 15, vecPtr );
3144 v2 = vec_ld( 31, vecPtr );
3145 v3 = vec_ld( 47, vecPtr );
3146 vecOld = vec_ld( 63, vecPtr );
3148 vecLd1 = vec_perm( v0, v1, permVec );
3149 vecLd2 = vec_perm( v1, v2, permVec );
3150 vecLd3 = vec_perm( v2, v3, permVec );
3151 vecLd4 = vec_perm( v3, vecOld, permVec );
3153 // each of these vectors contains 2 elements
3154 // looks like | X Y X Y | X Y X Y
3155 v0 = vec_min( vecLd1, vecLd2 );
3156 v1 = vec_min( vecLd3, vecLd4 );
3157 v0 = vec_min( v0, v1 );
3159 v2 = vec_max( vecLd1, vecLd2 );
3160 v3 = vec_max( vecLd3, vecLd4 );
3161 v2 = vec_max( v2, v3 );
3163 // since its always X Y X Y we don't have to re-merge each time. we can wait
3165 vecMin = vec_min( v0, vecMin );
3166 vecMax = vec_max( v2, vecMax );
// fold the two X/Y pairs together: lanes 0,1 now hold min/max X and Y
3169 vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
3170 vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
3171 v0 = vec_splat( vecMin, 0 );
3172 v1 = vec_splat( vecMin, 1 );
3173 v2 = vec_splat( vecMax, 0 );
3174 v3 = vec_splat( vecMax, 1 );
// single-element stores into the output vectors
3176 vec_ste( v0, 0, &min[0] );
3177 vec_ste( v1, 0, &min[1] );
3178 vec_ste( v2, 0, &max[0] );
3179 vec_ste( v3, 0, &max[1] );
// scalar tail compares against the values already stored in min/max
3183 for ( ; i < count; i++ ) {
3186 if ( v[0] < min[0] ) {
3189 if ( v[0] > max[0] ) {
3193 if ( v[1] < min[1] ) {
3196 if ( v[1] > max[1] ) {
3204 idSIMD_AltiVec::MinMax
3207 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
// Component-wise bounding box over count idVec3 elements.
// Outputs are seeded with +/- infinity so the scalar cleanup loop always updates them.
3208 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3209 //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
3212 const float *srcPtr = src[0].ToFloatPtr();
3215 register vector float vecLd1, vecLd2, vecLd3;
3216 register vector float vecMin, vecMax;
3217 register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
3218 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
// NOTE(review): vecMax is seeded with FLT_MIN, which is the smallest POSITIVE
// normalized float, not -FLT_MAX. If every input component is below FLT_MIN
// (e.g. all negative), the vector path reports a wrong max -- confirm intent.
3222 vecMin = (vector float)(FLT_MAX);
3223 vecMax = (vector float)(FLT_MIN);
// vec_lvsl/vec_perm pair realigns the possibly unaligned packed-float stream.
3225 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
3226 vector float vecOld = vec_ld( 0, srcPtr );
3228 // 4 elements at a time
3229 for ( ; i+3 < count; i += 4 ) {
3230 float *vecPtr = (float*)( srcPtr + (i*3) );
3231 vector float v0, v1, v2;
// overlapping loads; vecOld carries the trailing 16-byte line into the next pass
3234 v1 = vec_ld( 15, vecPtr );
3235 v2 = vec_ld( 31, vecPtr );
3236 vecOld = vec_ld( 47, vecPtr );
3238 vecLd1 = vec_perm( v0, v1, permVec );
3239 vecLd2 = vec_perm( v1, v2, permVec );
3240 vecLd3 = vec_perm( v2, vecOld, permVec );
3242 // put each idVec3 into its own vector as X Y Z (crap)
3244 vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
3245 vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
3246 vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
// tournament reduction of the 4 vertices into the running bounds
3249 vecMin1 = vec_min( vecSrc1, vecSrc2 );
3250 vecMin2 = vec_min( vecSrc3, vecSrc4 );
3251 vecMin1 = vec_min( vecMin1, vecMin2 );
3252 vecMin = vec_min( vecMin, vecMin1 );
3254 vecMax1 = vec_max( vecSrc1, vecSrc2 );
3255 vecMax2 = vec_max( vecSrc3, vecSrc4 );
3256 vecMax1 = vec_max( vecMax1, vecMax2 );
3257 vecMax = vec_max( vecMax1, vecMax );
3261 vector float v0, v1, v2, v3, v4, v5;
// splat each lane, then store single elements into the idVec3 outputs
3262 v0 = vec_splat( vecMin, 0 );
3263 v1 = vec_splat( vecMin, 1 );
3264 v2 = vec_splat( vecMin, 2 );
3265 v3 = vec_splat( vecMax, 0 );
3266 v4 = vec_splat( vecMax, 1 );
3267 v5 = vec_splat( vecMax, 2 );
3269 vec_ste( v0, 0, &min[0] );
3270 vec_ste( v1, 0, &min[1] );
3271 vec_ste( v2, 0, &min[2] );
3272 vec_ste( v3, 0, &max[0] );
3273 vec_ste( v4, 0, &max[1] );
3274 vec_ste( v5, 0, &max[2] );
// scalar cleanup for the 0-3 leftover elements
3278 for ( ; i < count; i ++ ) {
3281 if ( v[0] < min[0] ) {
3284 if ( v[0] > max[0] ) {
3287 if ( v[1] < min[1] ) {
3290 if ( v[1] > max[1] ) {
3293 if ( v[2] < min[2] ) {
3296 if ( v[2] > max[2] ) {
3302 #ifndef DRAWVERT_PADDED
3305 idSIMD_AltiVec::MinMax
3308 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
// Bounding box over the xyz members of count idDrawVerts (unpadded vertex
// layout: each xyz may start at an arbitrary alignment).
3310 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3313 register vector float vecMin, vecMax;
3315 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3316 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
// NOTE(review): FLT_MIN is the smallest positive float, not -FLT_MAX; an
// all-negative input set would make the vector path report a wrong max -- confirm.
3319 vecMin = (vector float)(FLT_MAX);
3320 vecMax = (vector float)(FLT_MIN);
// Four alignment permutes computed once, outside the loop; this assumes
// 4 * sizeof(idDrawVert) is a multiple of 16 so src[i] and src[i+4] share the
// same alignment each iteration -- TODO confirm against idDrawVert's size.
3322 vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3323 vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3324 vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3325 vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3327 for ( ; i+3 < count; i += 4) {
3328 const float *vertPtr = src[i].xyz.ToFloatPtr();
3329 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3330 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3331 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// offset 11 = last byte of the 12-byte xyz; vec_ld truncates the effective
// address to a 16-byte boundary, so this fetches the second line the vertex spans
3333 v0 = vec_ld( 0, vertPtr );
3334 v1 = vec_ld( 11, vertPtr );
3335 v2 = vec_ld( 0, vertPtr2 );
3336 v3 = vec_ld( 11, vertPtr2 );
3337 v4 = vec_ld( 0, vertPtr3 );
3338 v5 = vec_ld( 11, vertPtr3 );
3339 v6 = vec_ld( 0, vertPtr4 );
3340 v7 = vec_ld( 11, vertPtr4 );
3342 v0 = vec_perm( v0, v1, vertPerm1 );
3343 v2 = vec_perm( v2, v3, vertPerm2 );
3344 v4 = vec_perm( v4, v5, vertPerm3 );
3345 v6 = vec_perm( v6, v7, vertPerm4 );
// tournament reduction of the 4 vertices into the running bounds
3347 vecMin1 = vec_min( v0, v2 );
3348 vecMin2 = vec_min( v4, v6 );
3349 vecMin1 = vec_min( vecMin1, vecMin2 );
3350 vecMin = vec_min( vecMin, vecMin1 );
3352 vecMax1 = vec_max( v0, v2 );
3353 vecMax2 = vec_max( v4, v6 );
3354 vecMax1 = vec_max( vecMax1, vecMax2 );
3355 vecMax = vec_max( vecMax, vecMax1 );
3358 // now we have min/max vectors in X Y Z form, store out
3359 v0 = vec_splat( vecMin, 0 );
3360 v1 = vec_splat( vecMin, 1 );
3361 v2 = vec_splat( vecMin, 2 );
3362 v3 = vec_splat( vecMax, 0 );
3363 v4 = vec_splat( vecMax, 1 );
3364 v5 = vec_splat( vecMax, 2 );
3366 vec_ste( v0, 0, &min[0] );
3367 vec_ste( v1, 0, &min[1] );
3368 vec_ste( v2, 0, &min[2] );
3369 vec_ste( v3, 0, &max[0] );
3370 vec_ste( v4, 0, &max[1] );
3371 vec_ste( v5, 0, &max[2] );
// scalar cleanup for the 0-3 leftover vertices
3375 for ( ; i < count; i++ ) {
3378 if ( v[0] < min[0] ) {
3381 if ( v[0] > max[0] ) {
3385 if ( v[1] < min[1] ) {
3388 if ( v[1] > max[1] ) {
3392 if ( v[2] > max[2] ) {
3396 if ( v[2] < min[2] ) {
3404 idSIMD_AltiVec::MinMax
3407 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
// Bounding box over the xyz members of count idDrawVerts -- padded-vertex
// variant: xyz is loaded directly with no alignment permute, which presumes
// each vertex's xyz sits on a 16-byte boundary (DRAWVERT_PADDED build) -- verify.
3409 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3412 register vector float vecMin, vecMax;
3414 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3415 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
// NOTE(review): same FLT_MIN-vs--FLT_MAX seeding concern as the other MinMax
// overloads -- confirm behavior for all-negative inputs.
3418 vecMin = (vector float)(FLT_MAX);
3419 vecMax = (vector float)(FLT_MIN);
3421 for ( ; i+3 < count; i += 4) {
3422 const float *vertPtr = src[i].xyz.ToFloatPtr();
3423 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3424 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3425 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// each vec_ld reads 16 bytes; the 4th lane beyond xyz is never stored out,
// since only lanes 0-2 are written to min/max below
3427 v0 = vec_ld( 0, vertPtr );
3428 v2 = vec_ld( 0, vertPtr2 );
3429 v4 = vec_ld( 0, vertPtr3 );
3430 v6 = vec_ld( 0, vertPtr4 );
3432 vecMin1 = vec_min( v0, v2 );
3433 vecMin2 = vec_min( v4, v6 );
3434 vecMin1 = vec_min( vecMin1, vecMin2 );
3435 vecMin = vec_min( vecMin, vecMin1 );
3437 vecMax1 = vec_max( v0, v2 );
3438 vecMax2 = vec_max( v4, v6 );
3439 vecMax1 = vec_max( vecMax1, vecMax2 );
3440 vecMax = vec_max( vecMax, vecMax1 );
3443 // now we have min/max vectors in X Y Z form, store out
3444 v0 = vec_splat( vecMin, 0 );
3445 v1 = vec_splat( vecMin, 1 );
3446 v2 = vec_splat( vecMin, 2 );
3447 v3 = vec_splat( vecMax, 0 );
3448 v4 = vec_splat( vecMax, 1 );
3449 v5 = vec_splat( vecMax, 2 );
3451 vec_ste( v0, 0, &min[0] );
3452 vec_ste( v1, 0, &min[1] );
3453 vec_ste( v2, 0, &min[2] );
3454 vec_ste( v3, 0, &max[0] );
3455 vec_ste( v4, 0, &max[1] );
3456 vec_ste( v5, 0, &max[2] );
// scalar cleanup for the 0-3 leftover vertices
3460 for ( ; i < count; i++ ) {
3463 if ( v[0] < min[0] ) {
3466 if ( v[0] > max[0] ) {
3470 if ( v[1] < min[1] ) {
3473 if ( v[1] > max[1] ) {
3477 if ( v[2] > max[2] ) {
3481 if ( v[2] < min[2] ) {
3487 #endif /* DRAWVERT_PADDED */
3489 #ifndef DRAWVERT_PADDED
3492 idSIMD_AltiVec::MinMax
3495 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
// Bounding box over the xyz of the vertices selected by indexes[0..count-1]
// (unpadded vertex layout). Gathered addresses are arbitrary, so the alignment
// permutes must be recomputed for every vertex inside the loop.
3496 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3501 register vector float vecMin, vecMax;
3503 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3504 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
// NOTE(review): same FLT_MIN-vs--FLT_MAX seeding concern as the other MinMax
// overloads -- confirm behavior for all-negative inputs.
3508 vecMin = (vector float)(FLT_MAX);
3509 vecMax = (vector float)(FLT_MIN);
3511 vector unsigned char vertPerm1;
3512 vector unsigned char vertPerm2;
3513 vector unsigned char vertPerm3;
3514 vector unsigned char vertPerm4;
3516 for ( ; i+3 < count; i += 4) {
3517 const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3518 const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3519 const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3520 const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
// per-vertex permutes: indexed gather gives no stable alignment to hoist
3522 vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
3523 vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
3524 vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
3525 vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
3527 v0 = vec_ld( 0, vertPtr );
3528 v1 = vec_ld( 15, vertPtr );
3529 v2 = vec_ld( 0, vertPtr2 );
3530 v3 = vec_ld( 15, vertPtr2 );
3531 v4 = vec_ld( 0, vertPtr3 );
3532 v5 = vec_ld( 15, vertPtr3 );
3533 v6 = vec_ld( 0, vertPtr4 );
3534 v7 = vec_ld( 15, vertPtr4 );
3536 v0 = vec_perm( v0, v1, vertPerm1 );
3537 v2 = vec_perm( v2, v3, vertPerm2 );
3538 v4 = vec_perm( v4, v5, vertPerm3 );
3539 v6 = vec_perm( v6, v7, vertPerm4 );
// tournament reduction of the 4 gathered vertices into the running bounds
3541 vecMin1 = vec_min( v0, v2 );
3542 vecMin2 = vec_min( v4, v6 );
3543 vecMin1 = vec_min( vecMin1, vecMin2 );
3544 vecMin = vec_min( vecMin, vecMin1 );
3546 vecMax1 = vec_max( v0, v2 );
3547 vecMax2 = vec_max( v4, v6 );
3548 vecMax1 = vec_max( vecMax1, vecMax2 );
3549 vecMax = vec_max( vecMax, vecMax1 );
3552 // now we have min/max vectors in X Y Z form, store out
3553 v0 = vec_splat( vecMin, 0 );
3554 v1 = vec_splat( vecMin, 1 );
3555 v2 = vec_splat( vecMin, 2 );
3556 v3 = vec_splat( vecMax, 0 );
3557 v4 = vec_splat( vecMax, 1 );
3558 v5 = vec_splat( vecMax, 2 );
3560 vec_ste( v0, 0, &min[0] );
3561 vec_ste( v1, 0, &min[1] );
3562 vec_ste( v2, 0, &min[2] );
3563 vec_ste( v3, 0, &max[0] );
3564 vec_ste( v4, 0, &max[1] );
3565 vec_ste( v5, 0, &max[2] );
// scalar cleanup for the 0-3 leftover indexed vertices
3569 for ( ; i < count; i++ ) {
3570 v = src[indexes[i]].xyz;
3572 if ( v[0] < min[0] ) {
3575 if ( v[0] > max[0] ) {
3579 if ( v[1] < min[1] ) {
3582 if ( v[1] > max[1] ) {
3586 if ( v[2] > max[2] ) {
3590 if ( v[2] < min[2] ) {
3598 idSIMD_AltiVec::MinMax
3601 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
// Indexed bounding box -- padded-vertex variant: xyz is loaded directly,
// presuming each vertex's xyz is 16-byte aligned (DRAWVERT_PADDED) -- verify.
3602 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3607 register vector float vecMin, vecMax;
3609 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3610 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
// NOTE(review): same FLT_MIN-vs--FLT_MAX seeding concern as the other MinMax
// overloads -- confirm behavior for all-negative inputs.
3614 vecMin = (vector float)(FLT_MAX);
3615 vecMax = (vector float)(FLT_MIN);
// NOTE(review): these permute variables are declared but never used in this
// aligned variant -- dead locals, candidates for removal.
3617 vector unsigned char vertPerm1;
3618 vector unsigned char vertPerm2;
3619 vector unsigned char vertPerm3;
3620 vector unsigned char vertPerm4;
3622 for ( ; i+3 < count; i += 4) {
3623 const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3624 const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3625 const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3626 const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3628 v0 = vec_ld( 0, vertPtr );
3629 v2 = vec_ld( 0, vertPtr2 );
3630 v4 = vec_ld( 0, vertPtr3 );
3631 v6 = vec_ld( 0, vertPtr4 );
// tournament reduction of the 4 gathered vertices into the running bounds
3633 vecMin1 = vec_min( v0, v2 );
3634 vecMin2 = vec_min( v4, v6 );
3635 vecMin1 = vec_min( vecMin1, vecMin2 );
3636 vecMin = vec_min( vecMin, vecMin1 );
3638 vecMax1 = vec_max( v0, v2 );
3639 vecMax2 = vec_max( v4, v6 );
3640 vecMax1 = vec_max( vecMax1, vecMax2 );
3641 vecMax = vec_max( vecMax, vecMax1 );
3644 // now we have min/max vectors in X Y Z form, store out
3645 v0 = vec_splat( vecMin, 0 );
3646 v1 = vec_splat( vecMin, 1 );
3647 v2 = vec_splat( vecMin, 2 );
3648 v3 = vec_splat( vecMax, 0 );
3649 v4 = vec_splat( vecMax, 1 );
3650 v5 = vec_splat( vecMax, 2 );
3652 vec_ste( v0, 0, &min[0] );
3653 vec_ste( v1, 0, &min[1] );
3654 vec_ste( v2, 0, &min[2] );
3655 vec_ste( v3, 0, &max[0] );
3656 vec_ste( v4, 0, &max[1] );
3657 vec_ste( v5, 0, &max[2] );
// scalar cleanup for the 0-3 leftover indexed vertices
3661 for ( ; i < count; i++ ) {
3662 v = src[indexes[i]].xyz;
3664 if ( v[0] < min[0] ) {
3667 if ( v[0] > max[0] ) {
3671 if ( v[1] < min[1] ) {
3674 if ( v[1] > max[1] ) {
3678 if ( v[2] > max[2] ) {
3682 if ( v[2] < min[2] ) {
3689 #endif /* DRAWVERT_PADDED */
3691 #endif /* ENABLE_MINMAX */
3697 idSIMD_AltiVec::Clamp
3700 void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
// dst[i] = src[i] clamped into [min, max] for i in [0, count).
3701 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
3702 register vector float v0, v1, v2, v3, v4, v5;
3703 register vector unsigned char permVec;
3704 register vector float v0_low, v0_hi, v1_low, v1_hi;
3705 vector unsigned char oneVector = (vector unsigned char)(1);
3706 register vector float minVec, maxVec;
3709 //handle unaligned at start
// step scalar until dst is 16-byte aligned, so the vector loop can use aligned stores
3710 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3711 dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3714 //splat min/max into a vector
3715 minVec = loadSplatUnalignedScalar( &min );
3716 maxVec = loadSplatUnalignedScalar( &max );
3718 //calculate permute and do first load
3719 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3720 v1_hi = vec_ld( 0, &src[i] );
// 8 floats per pass; overlapping loads + vec_perm realign the unaligned src stream
3724 for ( ; i+7 < count; i += 8 ) {
3727 v0_hi = vec_ld( 15, &src[i] );
3729 v1_hi = vec_ld( 31, &src[i] );
3731 v0 = vec_perm( v0_low, v0_hi, permVec );
3732 v1 = vec_perm( v1_low, v1_hi, permVec );
// clamp = min( max( x, lo ), hi ): raise to min first, then cap at max
3735 v2 = vec_max( v0, minVec );
3736 v3 = vec_max( v1, minVec );
3739 v4 = vec_min( v2, maxVec );
3740 v5 = vec_min( v3, maxVec );
3742 ALIGNED_STORE2( &dst[i], v4, v5 );
// scalar cleanup of the last 0-7 elements
3746 for ( ; i < count ; i++ ) {
3747 dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3753 idSIMD_AltiVec::ClampMin
3756 void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
// dst[i] = max( src[i], min ): raise every element to at least min.
3757 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
3758 register vector float v0, v1, v2, v3;
3759 register vector unsigned char permVec;
3760 register vector float v0_low, v0_hi, v1_low, v1_hi;
3761 register vector float constVec;
3762 vector unsigned char oneVector = (vector unsigned char)(1);
3765 //handle unaligned at start
// step scalar until dst is 16-byte aligned, so the vector loop can use aligned stores
3766 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3767 dst[i] = src[i] < min ? min : src[i];
3770 //splat constant into a vector
3771 constVec = loadSplatUnalignedScalar( &min );
3773 //calculate permute and do first load
3774 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3775 v1_hi = vec_ld( 0, &src[i] );
// 8 floats per pass; overlapping loads + vec_perm realign the unaligned src stream
3778 for ( ; i+7 < count; i += 8 ) {
3781 v0_hi = vec_ld( 15, &src[i] );
3783 v1_hi = vec_ld( 31, &src[i] );
3785 v0 = vec_perm( v0_low, v0_hi, permVec );
3786 v1 = vec_perm( v1_low, v1_hi, permVec );
// vec_max implements the lower clamp vector-wide
3788 v2 = vec_max( v0, constVec );
3789 v3 = vec_max( v1, constVec );
3791 ALIGNED_STORE2( &dst[i], v2, v3 );
// scalar cleanup of the last 0-7 elements
3795 for ( ; i < count ; i++ ) {
3796 dst[i] = src[i] < min ? min : src[i];
3802 idSIMD_AltiVec::ClampMax
3805 void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
3806 //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
3807 register vector float v0, v1, v2, v3;
3808 register vector unsigned char permVec;
3809 register vector float constVec;
3810 register vector float v0_low, v0_hi, v1_low, v1_hi;
3811 vector unsigned char oneVector = (vector unsigned char)(1);
3814 //handle unaligned at start
3815 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3816 dst[i] = src[i] < max ? max : src[i];
3819 //splat constant into a vector
3820 constVec = loadSplatUnalignedScalar( &max );
3822 //calculate permute and do first load
3823 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3824 v1_hi = vec_ld( 0, &src[i] );
3827 for ( ; i+7 < count; i += 8 ) {
3830 v0_hi = vec_ld( 15, &src[i] );
3832 v1_hi = vec_ld( 31, &src[i] );
3834 v0 = vec_perm( v0_low, v0_hi, permVec );
3835 v1 = vec_perm( v1_low, v1_hi, permVec );
3836 v2 = vec_min( v0, constVec );
3837 v3 = vec_min( v1, constVec );
3839 ALIGNED_STORE2( &dst[i], v2, v3 );
3843 for ( ; i < count ; i++ ) {
3844 dst[i] = src[i] < max ? max : src[i];
3848 #endif /* ENABLE_CLAMP */
3850 #ifdef ENABLE_16ROUTINES
3854 idSIMD_AltiVec::Zero16
3857 void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
3858 memset( dst, 0, count * sizeof( float ) );
3863 idSIMD_AltiVec::Negate16
3869 void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
// In-place negation of a 16-byte-aligned float buffer. count is rounded up to
// a multiple of 4, so the buffer must be padded to that length.
3870 //#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
3873 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3875 // round count up to next 4 if need be
3876 int count2 = ( count + 3 ) & ~3;
3879 vector float v0, v1, v2, v3;
3881 //know its 16-byte aligned
// NOTE(review): 0.0f - x maps both +0.0 and -0.0 inputs to +0.0, whereas the
// sign-bit xor in the OPER() reference would turn +0.0 into -0.0 -- confirm
// the difference is acceptable to callers.
3882 for ( ; i + 7 < count2; i += 8 ) {
3883 v0 = vec_ld( 0, &dst[i] );
3884 v1 = vec_ld( 16, &dst[i] );
3886 v2 = vec_sub( (vector float)(0), v0 );
3887 v3 = vec_sub( (vector float)(0), v1 );
3889 ALIGNED_STORE2( &dst[i], v2, v3 );
// single-vector tail (0 or 1 iteration)
3892 for ( ; i < count2; i += 4 ) {
3893 v0 = vec_ld( 0, &dst[i] );
3894 v1 = vec_sub( (vector float)(0), v0 );
3895 vec_st( v1, 0, &dst[i] );
3901 idSIMD_AltiVec::Copy16
3904 void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
3905 //#define OPER(X) dst[(X)] = src[(X)]
3906 memcpy( dst, src, sizeof(float) * count );
3911 idSIMD_AltiVec::Add16
3914 Assumes dst, src1, src2 all start at aligned address
3917 void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
// dst[i] = src1[i] + src2[i]. All three buffers must start 16-byte aligned and
// be padded to a multiple of 4 floats, since count is rounded up below.
3918 //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
3921 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3923 assert( IS_16BYTE_ALIGNED( src1[0] ) );
3925 assert( IS_16BYTE_ALIGNED( src2[0] ) );
3927 // round count up to next 4 if need be
3928 int count2 = ( count + 3 ) & ~3;
3930 register vector float v0, v1, v2, v3, v4, v5;
3933 //know all data is 16-byte aligned, so vectorize!
// unrolled x2: 8 floats per iteration
3934 for ( ; i+7 < count2; i += 8 ) {
3936 v0 = vec_ld( 0, &src1[i] );
3937 v1 = vec_ld( 16, &src1[i] );
3938 v2 = vec_ld( 0, &src2[i] );
3939 v3 = vec_ld( 16, &src2[i] );
3940 v4 = vec_add( v0, v2 );
3941 v5 = vec_add( v1, v3 );
3943 ALIGNED_STORE2( &dst[i], v4, v5 );
// single-vector tail (0 or 1 iteration)
3946 for ( ; i < count2; i += 4 ) {
3947 v0 = vec_ld( 0, &src1[i] );
3948 v1 = vec_ld( 0, &src2[i] );
3949 v2 = vec_add( v0, v1 );
3950 vec_st( v2, 0, &dst[i] );
3956 idSIMD_AltiVec::Sub16
3959 Assumes that dst, src1, and src2 all start at aligned address
3962 void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
// dst[i] = src1[i] - src2[i]. All three buffers must start 16-byte aligned and
// be padded to a multiple of 4 floats, since count is rounded up below.
3963 //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
3965 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3967 assert( IS_16BYTE_ALIGNED( src1[0] ) );
3969 assert( IS_16BYTE_ALIGNED( src2[0] ) );
3971 // round count up to next 4 if need be
3972 int count2 = ( count + 3 ) & ~3;
3974 register vector float v0, v1, v2, v3, v4, v5;
3977 //know data is aligned, so vectorize!
// unrolled x2: 8 floats per iteration; operand order is src1 - src2
3978 for ( ; i+7 < count2; i += 8 ) {
3980 v0 = vec_ld( 0, &src1[i] );
3981 v1 = vec_ld( 16, &src1[i] );
3982 v2 = vec_ld( 0, &src2[i] );
3983 v3 = vec_ld( 16, &src2[i] );
3984 v4 = vec_sub( v0, v2 );
3985 v5 = vec_sub( v1, v3 );
3987 ALIGNED_STORE2( &dst[i], v4, v5 );
// single-vector tail (0 or 1 iteration)
3990 for ( ; i < count2; i += 4 ) {
3991 v0 = vec_ld( 0, &src1[i] );
3992 v1 = vec_ld( 0, &src2[i] );
3993 v2 = vec_sub( v0, v1 );
3994 vec_st( v2, 0, &dst[i] );
4000 idSIMD_AltiVec::Mul16
4003 Assumes that dst and src1 start at aligned address
4006 void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
// dst[i] = src1[i] * constant. dst and src1 must start 16-byte aligned and be
// padded to a multiple of 4 floats, since count is rounded up below.
4007 //#define OPER(X) dst[(X)] = src1[(X)] * constant
4010 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4012 assert( IS_16BYTE_ALIGNED( src1[0] ) );
4014 // round count up to next 4 if need be
4015 int count2 = ( count + 3 ) & ~3;
4017 register vector float v0, v1, v2, v3;
4018 register vector float constVec;
// AltiVec has no plain float multiply; vec_madd with a zero addend is the idiom
4019 register vector float zeroVector = (vector float)(0.0);
4022 //splat constant into a vector
4023 constVec = loadSplatUnalignedScalar( &constant );
4025 //know data is aligned, so vectorize!
// unrolled x2: 8 floats per iteration
4026 for ( ; i+7 < count2; i += 8 ) {
4028 v0 = vec_ld( 0, &src1[i] );
4029 v1 = vec_ld( 16, &src1[i] );
4030 v2 = vec_madd( constVec, v0, zeroVector );
4031 v3 = vec_madd( constVec, v1, zeroVector );
4032 ALIGNED_STORE2( &dst[i], v2, v3 );
// single-vector tail (0 or 1 iteration)
4035 for ( ; i < count2; i += 4 ) {
4036 v0 = vec_ld( 0, &src1[i] );
4037 v1 = vec_madd( constVec, v0, zeroVector );
4038 vec_st( v1, 0, &dst[i] );
4044 idSIMD_AltiVec::AddAssign16
4047 Assumes that dst and src start at aligned address
4050 void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
// dst[i] += src[i]. Both buffers must start 16-byte aligned and be padded to a
// multiple of 4 floats, since count is rounded up below.
4051 //#define OPER(X) dst[(X)] += src[(X)]
4054 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4056 assert( IS_16BYTE_ALIGNED( src[0] ) );
4058 // round count up to next 4 if need be
4059 int count2 = ( count + 3 ) & ~3;
4061 register vector float v0, v1, v2, v3, v4, v5;
// unrolled x2: 8 floats per iteration
4065 for ( ; i+7 < count2; i += 8 ) {
4066 v0 = vec_ld( 0, &src[i] );
4067 v1 = vec_ld( 16, &src[i] );
4068 v2 = vec_ld( 0, &dst[i] );
4069 v3 = vec_ld( 16, &dst[i] );
4070 v4 = vec_add( v0, v2 );
4071 v5 = vec_add( v1, v3 );
4072 ALIGNED_STORE2( &dst[i], v4, v5 );
// single-vector tail (0 or 1 iteration)
4075 for ( ; i < count2; i += 4 ) {
4076 v0 = vec_ld( 0, &src[i] );
4077 v1 = vec_ld( 0, &dst[i] );
4078 v2 = vec_add( v0, v1 );
4079 vec_st( v2, 0, &dst[i] );
4085 idSIMD_AltiVec::SubAssign16
4088 Assumes that dst and src start at aligned address
4091 void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
// dst[i] -= src[i]. Both buffers must start 16-byte aligned and be padded to a
// multiple of 4 floats, since count is rounded up below.
4092 //#define OPER(X) dst[(X)] -= src[(X)]
4093 register vector float v0, v1, v2, v3, v4, v5;
4097 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4099 assert( IS_16BYTE_ALIGNED( src[0] ) );
4100 // round count up to next 4 if need be
4101 int count2 = ( count + 3 ) & ~3;
// unrolled x2: 8 floats per iteration; note operand order: dst minus src
4104 for ( ; i+7 < count2; i += 8 ) {
4105 v0 = vec_ld( 0, &src[i] );
4106 v1 = vec_ld( 16, &src[i] );
4107 v2 = vec_ld( 0, &dst[i] );
4108 v3 = vec_ld( 16, &dst[i] );
4109 v4 = vec_sub( v2, v0 );
4110 v5 = vec_sub( v3, v1 );
4111 ALIGNED_STORE2( &dst[i], v4, v5 );
// single-vector tail (0 or 1 iteration)
4114 for ( ; i < count2; i += 4 ) {
4115 v0 = vec_ld( 0, &src[i] );
4116 v1 = vec_ld( 0, &dst[i] );
4117 v2 = vec_sub( v1, v0 );
4118 vec_st( v2, 0, &dst[i] );
4124 idSIMD_AltiVec::MulAssign16
4127 Assumes that dst starts at aligned address and count is multiple of 4
4130 void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
// dst[i] *= constant, in place. dst must start 16-byte aligned and be padded
// to a multiple of 4 floats, since count is rounded up below.
4131 //#define OPER(X) dst[(X)] *= constant
4134 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4135 // round count up to next 4 if need be
4136 int count2 = ( count + 3 ) & ~3;
4138 register vector float v0, v1, v2, v3;
4139 register vector float constVec;
// AltiVec has no plain float multiply; vec_madd with a zero addend is the idiom
4141 register vector float zeroVector = (vector float)(0.0);
4143 //splat constant into a vector
4144 constVec = loadSplatUnalignedScalar( &constant );
// unrolled x2: 8 floats per iteration
4147 for ( ; i+7 < count2; i += 8 ) {
4148 v0 = vec_ld( 0, &dst[i] );
4149 v1 = vec_ld( 16, &dst[i] );
4150 v2 = vec_madd( v0, constVec, zeroVector );
4151 v3 = vec_madd( v1, constVec, zeroVector );
4152 ALIGNED_STORE2( &dst[i], v2, v3 );
// single-vector tail (0 or 1 iteration)
4155 for ( ; i < count2; i += 4 ) {
4156 v0 = vec_ld( 0, &dst[i] );
4157 v1 = vec_madd( v0, constVec, zeroVector );
4158 vec_st( v1, 0, &dst[i] );
4162 #endif /* ENABLE_16ROUTINES */
4164 #ifdef ENABLE_LOWER_TRIANGULAR
4168 idSIMD_AltiVec::MatX_LowerTriangularSolve
4170 solves x in L * x = b for the first n rows of L
4171 if skip > 0 the first skip elements of x are assumed to be valid already
4172 L has to be a lower triangular matrix with (implicit) ones on the diagonal
4177 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
// Forward substitution: solves L * x = b for the first n rows, where L is
// lower triangular with implicit ones on the diagonal. If skip > 0 the first
// skip entries of x are assumed valid already. Rows are processed 4 at a time;
// each row's dot product against x[0..i) is vectorized 8 floats per pass.
4192 vector float vecSum1 = (vector float)(0.0);
4193 vector float vecSum2 = (vector float)(0.0);
4194 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
4195 vector float zeroVector = (vector float)(0.0);
4196 vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
// x[] may be unaligned; one permute serves every load from it
4198 vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
4200 // unrolled this loop a bit
4201 for ( i = skip; i+3 < n; i+=4 ) {
// two partial accumulators per row (8-wide unrolling)
4207 vecSum1 = zeroVector;
4208 vecSum2 = zeroVector;
4209 vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
// per-row permutes: each of the 4 matrix rows can carry its own alignment
4215 vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4216 vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
4217 vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
4218 vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
4220 for ( j = 0 ; j+7 < i; j+=8 ) {
// realign 8 floats of x via overlapping loads
4222 v0 = vec_ld( 0, &x[j] );
4223 v1 = vec_ld( 15, &x[j] );
4224 vector float vecExtraX = vec_ld( 31, &x[j] );
4225 v0 = vec_perm( v0, v1, vecPermX );
4226 v1 = vec_perm( v1, vecExtraX, vecPermX );
// realign 8 floats of each of the 4 rows the same way
4228 v2 = vec_ld( 0, lptr + j );
4229 v3 = vec_ld( 15, lptr + j );
4230 vector float vecExtra1 = vec_ld( 31, lptr + j );
4231 v2 = vec_perm( v2, v3, vecPermLptr1 );
4232 v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
4234 v4 = vec_ld( 0, lptr2 + j );
4235 v5 = vec_ld( 15, lptr2 + j );
4236 vector float vecExtra2 = vec_ld( 31, lptr2 + j );
4237 v4 = vec_perm( v4, v5, vecPermLptr2 );
4238 v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
4240 v6 = vec_ld( 0, lptr3 + j );
4241 v7 = vec_ld( 15, lptr3 + j );
4242 vector float vecExtra3 = vec_ld( 31, lptr3 + j );
4243 v6 = vec_perm( v6, v7, vecPermLptr3 );
4244 v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
4246 v8 = vec_ld( 0, lptr4 + j );
4247 v9 = vec_ld( 15, lptr4 + j );
4248 vector float vecExtra4 = vec_ld( 31, lptr4 + j );
4249 v8 = vec_perm( v8, v9, vecPermLptr4 );
4250 v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
// accumulate row . x into the 8 partial sums
4252 vecSum1 = vec_madd( v2, v0, vecSum1 );
4253 vecSum2 = vec_madd( v3, v1, vecSum2 );
4255 vecSum3 = vec_madd( v4, v0, vecSum3 );
4256 vecSum4 = vec_madd( v5, v1, vecSum4 );
4258 vecSum5 = vec_madd( v6, v0, vecSum5 );
4259 vecSum6 = vec_madd( v7, v1, vecSum6 );
4261 vecSum7 = vec_madd( v8, v0, vecSum7 );
4262 vecSum8 = vec_madd( v9, v1, vecSum8 );
4265 // if we ran the unrolled code, we need to sum across the vectors
4266 // to find out how much to subtract from sum
4268 vecSum1 = vec_add( vecSum1, vecSum2 );
4269 vecSum3 = vec_add( vecSum3, vecSum4 );
4270 vecSum5 = vec_add( vecSum5, vecSum6 );
4271 vecSum7 = vec_add( vecSum7, vecSum8 );
4272 //sum across the vectors
// horizontal reduction via two rotate-and-add steps per accumulator
4273 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4274 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4276 vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
4277 vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
4279 vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
4280 vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
4282 vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
4283 vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
4285 //move the result to the FPU
4286 vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4287 vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
4288 vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
4289 vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
// scalar tail of each row's dot product (up to 7 elements)
4298 for ( ; j < i; j++ ) {
4299 sum -= lptr[j] * x[j];
4300 sum2 -= lptr2[j] * x[j];
4301 sum3 -= lptr3[j] * x[j];
4302 sum4 -= lptr4[j] * x[j];
4305 // store the 4 results at a time
// in-register forward substitution between the 4 freshly solved rows:
// each later row must subtract the newer x values just computed above it
4306 sum2 -= ( lptr2[i] * sum );
4307 sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
4308 sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
// single-row handling for the last n % 4 rows
4317 for ( ; i < n; i++ ) {
4319 vecSum1 = zeroVector;
4320 vecSum2 = zeroVector;
4322 vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4324 for ( j = 0 ; j+7 < i; j+=8 ) {
4326 v0 = vec_ld( 0, &x[j] );
4327 v2 = vec_ld( 15, &x[j] );
4328 vector float vecExtraX = vec_ld( 31, &x[j] );
4329 v0 = vec_perm( v0, v2, vecPermX );
4330 v2 = vec_perm( v2, vecExtraX, vecPermX );
4332 v1 = vec_ld( 0, lptr + j );
4333 v3 = vec_ld( 15, lptr + j );
4334 vector float vecExtra = vec_ld( 31, lptr + j );
4335 v1 = vec_perm( v1, v3, vecPermLptr );
4336 v3 = vec_perm( v3, vecExtra, vecPermLptr );
4338 vecSum1 = vec_madd( v1, v0, vecSum1 );
4339 vecSum2 = vec_madd( v3, v2, vecSum2 );
4342 // if we ran the unrolled code, we need to sum across the vectors
4343 // to find out how much to subtract from sum
4345 //sum across the vectors
4346 vecSum1 = vec_add( vecSum1, vecSum2 );
4347 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4348 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4350 //move the result to the FPU
4351 vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
// scalar tail of the dot product
4356 for ( ; j < i; j++ ) {
4357 sum -= lptr[j] * x[j];
4365 idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
4367 solves x in L.Transpose() * x = b for the first n rows of L
4368 L has to be a lower triangular matrix with (implicit) ones on the diagonal
4372 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
// Back substitution with L transposed: solves L.Transpose() * x = b for the
// first n rows, L being lower triangular with implicit ones on the diagonal.
// Small systems (n < 8) are fully unrolled scalar code; larger systems use a
// blocked loop that retires 4 rows per iteration, scanning columns (i.e. the
// rows of L) below the current block.
4377 lptr = L.ToFloatPtr();
4378 nc = L.GetNumColumns();
4380 float x0, x1, x2, x3, x4, x5, x6;
4381 // unrolled cases for n < 8
4384 // using local variables to avoid aliasing issues
// each case solves from the last unknown upward, substituting as it goes
4392 x0 = b[0] - lptr[1*nc+0] * x1;
4399 x1 = b[1] - lptr[2*nc+1] * x2;
4400 x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4408 x2 = b[2] - lptr[3*nc+2] * x3;
4409 x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4410 x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4420 x3 = b[3] - lptr[4*nc+3] * x4;
4421 x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4422 x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4423 x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4433 x4 = b[4] - lptr[5*nc+4] * x5;
4434 x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4435 x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4436 x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4437 x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4449 x5 = b[5] - lptr[6*nc+5] * x6;
4450 x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
4451 x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4452 x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4453 x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4454 x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
// general case: blocked scalar back substitution, 4 rows per outer iteration
4469 register float s0, s1, s2, s3;
// lptr points at the top-left of the current 4-row block's trailing columns
4472 lptr = L.ToFloatPtr() + n * nc + n - 4;
4475 // process 4 rows at a time
4476 for ( i = n; i >= 4; i -= 4 ) {
4481 // process 4x4 blocks
4482 for ( j = 0; j < n-i; j += 4 ) {
4483 s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
4484 s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
4485 s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
4486 s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
4487 s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
4488 s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
4489 s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
4490 s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
4491 s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
4492 s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
4493 s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
4494 s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
4495 s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
4496 s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
4497 s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
4498 s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
4500 // process left over of the 4 rows
// in-register substitution inside the block: later unknowns feed earlier ones
4501 s0 -= lptr[0-1*nc] * s3;
4502 s1 -= lptr[1-1*nc] * s3;
4503 s2 -= lptr[2-1*nc] * s3;
4504 s0 -= lptr[0-2*nc] * s2;
4505 s1 -= lptr[1-2*nc] * s2;
4506 s0 -= lptr[0-3*nc] * s1;
4512 // update pointers for next four rows
4516 // process left over rows
// final n % 4 rows, one at a time
4517 for ( i--; i >= 0; i-- ) {
4520 for ( j = i + 1; j < n; j++ ) {
4521 s0 -= lptr[j*nc] * x[j];
4529 idSIMD_AltiVec::MatX_LDLTFactor
// idSIMD_AltiVec::MatX_LDLTFactor
//
// In-place LDL' factorization of the upper-left n x n portion of 'mat'.
// The reciprocals of the diagonal entries are written to invDiag; the elided
// branches after each "sum == 0.0f" pivot test presumably return false for a
// singular matrix (TODO confirm against the unelided source).
// Rows 0..3 are factored by hand-specialized straight-line code; rows >= 4 use
// 4-way unrolled dot-product loops with fallthrough switches for leftovers.
// NOTE(review): this listing is an elided excerpt - loop/branch closers and
// several statements between the visible lines are not shown here.
4532 bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
4534 	float *v, *diag, *mptr;
4535 	float s0, s1, s2, s3, sum, d;
4536 	float s0_2, s1_2, s2_2, s3_2, sum_2;
// scratch buffers: v caches diag[k]*mptr[k] products, diag caches pivots.
// _alloca16 is stack allocation - large n risks stack overflow (engine-wide idiom).
4539 	v = (float *) _alloca16( n * sizeof( float ) );
4540 	diag = (float *) _alloca16( n * sizeof( float ) );
4542 	nc = mat.GetNumColumns();
// --- row 0: pivot is mat[0][0]; zero pivot aborts (elided branch) ---
4552 	if ( sum == 0.0f ) {
4557 	invDiag[0] = d = 1.0f / sum;
// scale the rest of column 0 by the reciprocal pivot
4564 	for ( j = 1; j < n; j++ ) {
4565 		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
// --- row 1 ---
4570 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4573 	if ( sum == 0.0f ) {
4579 	invDiag[1] = d = 1.0f / sum;
4586 	for ( j = 2; j < n; j++ ) {
4587 		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
// --- row 2 ---
4592 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4593 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4594 	sum = mptr[2] - s0 - s1;
4596 	if ( sum == 0.0f ) {
4602 	invDiag[2] = d = 1.0f / sum;
4609 	for ( j = 3; j < n; j++ ) {
4610 		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
// --- row 3 ---
4615 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4616 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4617 	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4618 	sum = mptr[3] - s0 - s1 - s2;
4620 	if ( sum == 0.0f ) {
4626 	invDiag[3] = d = 1.0f / sum;
4633 	for ( j = 4; j < n; j++ ) {
4634 		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
// --- general case: rows 4..n-1 ---
4637 	for ( i = 4; i < n; i++ ) {
// accumulate the diagonal dot product in four partial sums (s0..s3) so the
// four FP chains can be scheduled independently
4641 		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4642 		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4643 		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4644 		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
4645 		for ( k = 4; k < i-3; k += 4 ) {
4646 			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
4647 			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4648 			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
4649 			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
// leftover elements: intentional switch fallthrough (Duff's-device style);
// the switch header on (i - k) or similar is elided from this listing
4652 			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
4653 			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4654 			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
4660 		sum = mptr[i] - sum;
4662 		if ( sum == 0.0f ) {
4668 		invDiag[i] = d = 1.0f / sum;
4674 		// unrolling madness!
// process the remaining rows two at a time (mptr / mptr2 one row apart)
4676 		mptr2 = mat[i+1] + nc;
4678 		for ( j = i+1; j+1 < n; j+=2 ) {
4679 			s0 = mptr[0] * v[0];
4680 			s1 = mptr[1] * v[1];
4681 			s2 = mptr[2] * v[2];
4682 			s3 = mptr[3] * v[3];
4684 			s0_2 = mptr2[0] * v[0];
4685 			s1_2 = mptr2[1] * v[1];
4686 			s2_2 = mptr2[2] * v[2];
4687 			s3_2 = mptr2[3] * v[3];
// 8-way unrolled inner dot products for both rows
4689 			for ( k = 4; k < i-7; k += 8 ) {
4690 				s0 += mptr[k+0] * v[k+0];
4691 				s1 += mptr[k+1] * v[k+1];
4692 				s2 += mptr[k+2] * v[k+2];
4693 				s3 += mptr[k+3] * v[k+3];
4694 				s0 += mptr[k+4] * v[k+4];
4695 				s1 += mptr[k+5] * v[k+5];
4696 				s2 += mptr[k+6] * v[k+6];
4697 				s3 += mptr[k+7] * v[k+7];
4699 				s0_2 += mptr2[k+0] * v[k+0];
4700 				s1_2 += mptr2[k+1] * v[k+1];
4701 				s2_2 += mptr2[k+2] * v[k+2];
4702 				s3_2 += mptr2[k+3] * v[k+3];
4703 				s0_2 += mptr2[k+4] * v[k+4];
4704 				s1_2 += mptr2[k+5] * v[k+5];
4705 				s2_2 += mptr2[k+6] * v[k+6];
4706 				s3_2 += mptr2[k+7] * v[k+7];
// leftover elements, again via intentional fallthrough (switch header elided)
4710 			case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
4711 			case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
4712 			case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
4713 			case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
4714 			case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
4715 			case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
4716 			case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
4718 			// disassociate these adds
4725 			sum_2 = s1_2 + s3_2;
4727 			mptr[i] = ( mptr[i] - sum ) * d;
4728 			mptr2[i] = ( mptr2[i] - sum_2 ) * d;
// odd leftover row (when n - (i+1) is odd)
4735 		for ( ; j < n; j++ ) {
4736 			s0 = mptr[0] * v[0];
4737 			s1 = mptr[1] * v[1];
4738 			s2 = mptr[2] * v[2];
4739 			s3 = mptr[3] * v[3];
4740 			for ( k = 4; k < i-7; k += 8 ) {
4741 				s0 += mptr[k+0] * v[k+0];
4742 				s1 += mptr[k+1] * v[k+1];
4743 				s2 += mptr[k+2] * v[k+2];
4744 				s3 += mptr[k+3] * v[k+3];
4745 				s0 += mptr[k+4] * v[k+4];
4746 				s1 += mptr[k+5] * v[k+5];
4747 				s2 += mptr[k+6] * v[k+6];
4748 				s3 += mptr[k+7] * v[k+7];
4751 			case 7: s0 += mptr[k+6] * v[k+6];
4752 			case 6: s1 += mptr[k+5] * v[k+5];
4753 			case 5: s2 += mptr[k+4] * v[k+4];
4754 			case 4: s3 += mptr[k+3] * v[k+3];
4755 			case 3: s0 += mptr[k+2] * v[k+2];
4756 			case 2: s1 += mptr[k+1] * v[k+1];
4757 			case 1: s2 += mptr[k+0] * v[k+0];
4759 			// disassociate these adds
4763 			mptr[i] = ( mptr[i] - sum ) * d;
4769 #endif /* ENABLE_LOWER_TRIANGULAR */
4772 #ifdef LIVE_VICARIOUSLY
4775 idSIMD_AltiVec::BlendJoints
// idSIMD_AltiVec::BlendJoints
//
// Blends 'joints' toward 'blendJoints' by 'lerp' for each joint listed in
// 'index': quaternions are SLERPed, translations are LERPed.  Four joints are
// processed per vector iteration; unaligned idJointQuat data is loaded with
// the classic vec_ld(0/15) + vec_perm(vec_lvsl) idiom, transposed into
// planar X/Y/Z/W vectors, blended, transposed back, and scattered out with
// vec_ste.  A scalar Slerp/Lerp loop handles the tail.
// NOTE(review): this listing is an elided excerpt - early returns and loop
// closers between the visible lines are not shown here.
4778 void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
4781 	// since lerp is a constant, we can special case the two cases if they're true
4782 	if ( lerp <= 0.0f ) {
4783 		// this sets joints back to joints. No sense in doing no work, so just return
4787 	if ( lerp >= 1.0f ) {
4788 		// this copies each q from blendJoints to joints and copies each t from blendJoints to joints
4789 		memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
4793 	vector float vecLerp = loadSplatUnalignedScalar( &lerp );
4794 	vector float zeroVector = (vector float)(0);
4796 	for ( i = 0; i+3 < numJoints; i+=4 ) {
// gather the four joint indices for this iteration
4798 		int j2 = index[i+1];
4799 		int j3 = index[i+2];
4800 		int j4 = index[i+3];
4803 		const float *jointPtr = joints[j].q.ToFloatPtr();
4804 		const float *blendPtr = blendJoints[j].q.ToFloatPtr();
4805 		const float *jointPtr2 = joints[j2].q.ToFloatPtr();
4806 		const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
4807 		const float *jointPtr3 = joints[j3].q.ToFloatPtr();
4808 		const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
4809 		const float *jointPtr4 = joints[j4].q.ToFloatPtr();
4810 		const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
// permute vectors for unaligned loads from each joint/blend pointer
4812 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
4813 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
4814 		vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
4815 		vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
4817 		vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
4818 		vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
4819 		vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
4820 		vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
4822 		vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
4823 		vector float v12, v13, v14, v15, v16;
4824 		vector float vecFromX, vecFromY, vecFromZ, vecFromW;
4825 		vector float vecToX, vecToY, vecToZ, vecToW;
4827 		// load up the the idJointQuats from joints
4828 		v0 = vec_ld( 0, jointPtr );
4829 		v1 = vec_ld( 15, jointPtr );
4830 		v2 = vec_perm( v0, v1, permVec );
4832 		v3 = vec_ld( 0, jointPtr2 );
4833 		v4 = vec_ld( 15, jointPtr2 );
4834 		v5 = vec_perm( v3, v4, permVec2 );
4836 		v6 = vec_ld( 0, jointPtr3 );
4837 		v7 = vec_ld( 15, jointPtr3 );
4838 		v8 = vec_perm( v6, v7, permVec3 );
4840 		v9 = vec_ld( 0, jointPtr4 );
4841 		v10 = vec_ld( 15, jointPtr4 );
4842 		v11 = vec_perm( v9, v10, permVec4 );
4844 		// planarizing, so put each x y z w into its own vector
4845 		v0 = vec_mergeh( v2, v8 );
4846 		v1 = vec_mergeh( v5, v11 );
4847 		v3 = vec_mergel( v2, v8 );
4848 		v4 = vec_mergel( v5, v11 );
4850 		vecFromX = vec_mergeh( v0, v1 );
4851 		vecFromY = vec_mergel( v0, v1 );
4852 		vecFromZ = vec_mergeh( v3, v4 );
4853 		vecFromW = vec_mergel( v3, v4 );
4855 		// load up idJointQuats from blendJoints
4856 		v5 = vec_ld( 0, blendPtr );
4857 		v6 = vec_ld( 15, blendPtr );
4858 		v7 = vec_perm( v5, v6, permVec5 );
4860 		v8 = vec_ld( 0, blendPtr2 );
4861 		v9 = vec_ld( 15, blendPtr2 );
4862 		v10 = vec_perm( v8, v9, permVec6 );
4864 		v11 = vec_ld( 0, blendPtr3 );
4865 		v12 = vec_ld( 15, blendPtr3 );
4866 		v13 = vec_perm( v11, v12, permVec7 );
4868 		v14 = vec_ld( 0, blendPtr4 );
4869 		v15 = vec_ld( 15, blendPtr4 );
4870 		v16 = vec_perm( v14, v15, permVec8 );
4872 		// put these into their own vectors too
4873 		v5 = vec_mergeh( v7, v13 );
4874 		v6 = vec_mergeh( v10, v16 );
4875 		v8 = vec_mergel( v7, v13 );
4876 		v9 = vec_mergel( v10, v16 );
4878 		vecToX = vec_mergeh( v5, v6 );
4879 		vecToY = vec_mergel( v5, v6 );
4880 		vecToZ = vec_mergeh( v8, v9 );
4881 		vecToW = vec_mergel( v8, v9 );
// cosom = dot(from, to) for all four quaternion pairs at once
4884 		vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
4885 		vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
4886 		vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
4887 		vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
4889 		// if cosom is < 0, negate it and set temp to negated elements in to. otherwise, set temp to
4891 		vector bool int vecCmp, vecCmp2;
4892 		vecCmp = vec_cmplt( vecCosom, zeroVector );
// branch-free negation: select the negated lanes where cosom < 0 so each of
// the four quaternions takes the shorter arc
4895 		vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
4896 		vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
4897 		vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
4898 		vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
4899 		vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
4901 		// check if we need to calculate scale
4902 		vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
4903 		vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
4904 		vector float vecScale1 = vec_splat( vecLerp, 0 );
// full slerp path: omega = atan2(sqrt(1-cosom^2), cosom), computed with the
// fast vector approximations; both scale factors are always computed and the
// linear fallback is selected per-lane below
4906 		vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
4907 		vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
4908 		vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
4910 		vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4911 		vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4913 		// see which ones we have to insert into our scale0 and scale1 vectors
4914 		vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
4915 		vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
4917 		// multiply each element by the scale
4918 		vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
4919 		vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
4920 		vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
4921 		vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
4923 		// multiply temp by scale and add to result
4924 		vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
4925 		vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
4926 		vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
4927 		vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
4929 		// do a transform again to get the results back to vectors we can store out
4930 		v5 = vec_mergeh( vecFromX, vecFromZ );
4931 		v6 = vec_mergeh( vecFromY, vecFromW );
4932 		v8 = vec_mergel( vecFromX, vecFromZ );
4933 		v9 = vec_mergel( vecFromY, vecFromW );
4935 		vecToX = vec_mergeh( v5, v6 );
4936 		vecToY = vec_mergel( v5, v6 );
4937 		vecToZ = vec_mergeh( v8, v9 );
4938 		vecToW = vec_mergel( v8, v9 );
// unaligned stores: rotate each result into store alignment, then write the
// four floats one element at a time with vec_ste
4940 		vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
4941 		vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
4942 		vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
4943 		vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
4945 		// right rotate the input data
4946 		vecToX = vec_perm( vecToX, vecToX, storePerm1 );
4947 		vecToY = vec_perm( vecToY, vecToY, storePerm2 );
4948 		vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
4949 		vecToW = vec_perm( vecToW, vecToW, storePerm4 );
4951 		vec_ste( vecToX, 0, (float*) jointPtr );
4952 		vec_ste( vecToX, 4, (float*) jointPtr );
4953 		vec_ste( vecToX, 8, (float*) jointPtr );
4954 		vec_ste( vecToX, 12, (float*) jointPtr );
4956 		vec_ste( vecToY, 0, (float*) jointPtr2 );
4957 		vec_ste( vecToY, 4, (float*) jointPtr2 );
4958 		vec_ste( vecToY, 8, (float*) jointPtr2 );
4959 		vec_ste( vecToY, 12, (float*) jointPtr2 );
4961 		vec_ste( vecToZ, 0, (float*) jointPtr3 );
4962 		vec_ste( vecToZ, 4, (float*) jointPtr3 );
4963 		vec_ste( vecToZ, 8, (float*) jointPtr3 );
4964 		vec_ste( vecToZ, 12, (float*) jointPtr3 );
4966 		vec_ste( vecToW, 0, (float*) jointPtr4 );
4967 		vec_ste( vecToW, 4, (float*) jointPtr4 );
4968 		vec_ste( vecToW, 8, (float*) jointPtr4 );
4969 		vec_ste( vecToW, 12, (float*) jointPtr4 );
4971 		// lerp is v1 + l * ( v2 - v1 );
4972 		// the idVec3 T is going to be 12 bytes after the Q, so we can do this without calling ToFloatPtr() again. since its
// assumes idJointQuat layout is { idQuat q; idVec3 t; } with t immediately
// following the 4 floats of q - TODO confirm against idJointQuat declaration
4973 		float *jointVecPtr = (float*)( jointPtr + 4 );
4974 		float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
4975 		float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
4976 		float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
4978 		v0 = vec_ld( 0, jointVecPtr );
4979 		v1 = vec_ld( 11, jointVecPtr );
4980 		vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
4982 		v2 = vec_ld( 0, jointVecPtr2 );
4983 		v3 = vec_ld( 11, jointVecPtr2 );
4984 		vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
4986 		v4 = vec_ld( 0, jointVecPtr3 );
4987 		v5 = vec_ld( 11, jointVecPtr3 );
4988 		vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
4990 		v6 = vec_ld( 0, jointVecPtr4 );
4991 		v7 = vec_ld( 11, jointVecPtr4 );
4992 		vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
4994 		vector float vecVecX, vecVecY, vecVecZ;
4995 		vecVecX = vecVecY = vecVecZ = zeroVector;
// transpose the four idVec3 translations into planar X/Y/Z vectors
4998 		v0 = vec_mergeh( vecLd1, vecLd3 );
4999 		v1 = vec_mergeh( vecLd2, vecLd4 );
5000 		v3 = vec_mergel( vecLd1, vecLd3 );
5001 		v4 = vec_mergel( vecLd2, vecLd4 );
5003 		vecVecX = vec_mergeh( v0, v1 );
5004 		vecVecY = vec_mergel( v0, v1 );
5005 		vecVecZ = vec_mergeh( v3, v4 );
5007 		// load blend joint idvec3's
5008 		float *blendVecPtr = (float*)( blendPtr + 4 );
5009 		float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
5010 		float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
5011 		float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
5013 		v0 = vec_ld( 0, blendVecPtr );
5014 		v1 = vec_ld( 11, blendVecPtr );
5015 		vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
5017 		v2 = vec_ld( 0, blendVecPtr2 );
5018 		v3 = vec_ld( 11, blendVecPtr2 );
5019 		vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
5021 		v4 = vec_ld( 0, blendVecPtr3 );
5022 		v5 = vec_ld( 11, blendVecPtr3 );
5023 		vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
5025 		v6 = vec_ld( 0, blendVecPtr4 );
5026 		v7 = vec_ld( 11, blendVecPtr4 );
5027 		vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
5029 		vector float vecBlendX, vecBlendY, vecBlendZ;
5030 		vecBlendX = vecBlendY = vecBlendZ = zeroVector;
5033 		v0 = vec_mergeh( vecLd5, vecLd7 );
5034 		v1 = vec_mergeh( vecLd6, vecLd8 );
5035 		v3 = vec_mergel( vecLd5, vecLd7 );
5036 		v4 = vec_mergel( vecLd6, vecLd8 );
5038 		vecBlendX = vec_mergeh( v0, v1 );
5039 		vecBlendY = vec_mergel( v0, v1 );
5040 		vecBlendZ = vec_mergeh( v3, v4 );
// translation lerp: t = t1 + lerp * ( t2 - t1 )
5043 		vecWork1 = vec_sub( vecBlendX, vecVecX );
5044 		vecWork2 = vec_sub( vecBlendY, vecVecY );
5045 		vecWork3 = vec_sub( vecBlendZ, vecVecZ );
5047 		// multiply by lerp and add to v1
5048 		vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
5049 		vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
5050 		vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
5052 		// put it back in original form
5053 		v0 = vec_mergeh( vecVecX, vecVecZ );
5054 		v1 = vec_mergeh( vecVecY, zeroVector );
5055 		v3 = vec_mergel( vecVecX, vecVecZ );
5056 		v4 = vec_mergel( vecVecY, zeroVector );
5058 		// generate vectors to store
5059 		vecWork1 = vec_mergeh( v0, v1 );
5060 		vecWork2 = vec_mergel( v0, v1 );
5061 		vecWork3 = vec_mergeh( v3, v4 );
5062 		vector float vecWork4 = vec_mergel( v3, v4 );
5064 		// store the T values
5065 		storePerm1 = vec_lvsr( 0, jointVecPtr );
5066 		storePerm2 = vec_lvsr( 0, jointVecPtr2 );
5067 		storePerm3 = vec_lvsr( 0, jointVecPtr3 );
5068 		storePerm4 = vec_lvsr( 0, jointVecPtr4 );
5070 		// right rotate the input data
5071 		vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
5072 		vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
5073 		vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
5074 		vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
// only 3 floats per translation - store elements 0, 4, 8
5076 		vec_ste( vecWork1, 0, (float*) jointVecPtr );
5077 		vec_ste( vecWork1, 4, (float*) jointVecPtr );
5078 		vec_ste( vecWork1, 8, (float*) jointVecPtr );
5080 		vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
5081 		vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
5082 		vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
5084 		vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
5085 		vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
5086 		vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
5088 		vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
5089 		vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
5090 		vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
// scalar cleanup for the remaining (numJoints % 4) joints
5094 	for ( ; i < numJoints; i++ ) {
5096 		joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
5097 		joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
5103 idSIMD_AltiVec::ConvertJointQuatsToJointMats
5107 // SSE doesn't vectorize this, and I don't think we should either. Its mainly just copying data, there's very little math involved and
5108 // it's not easily parallelizable
// idSIMD_AltiVec::ConvertJointQuatsToJointMats
//
// Scalar conversion of each joint quaternion (plus translation) into a 3x4
// joint matrix, using the standard quaternion-to-rotation-matrix identities.
// Deliberately not vectorized (see comment above in the original file): the
// work is mostly data movement.
// NOTE(review): this listing is an elided excerpt - the off-diagonal matrix
// assignments and the translation copy between the visible lines are not
// shown here.
5109 void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
5111 	for ( int i = 0; i < numJoints; i++ ) {
5113 		const float *q = jointQuats[i].q.ToFloatPtr();
5114 		float *m = jointMats[i].ToFloatPtr();
// precompute doubled components: x2 = 2x, y2 = 2y, z2 = 2z
5120 		float x2 = q[0] + q[0];
5121 		float y2 = q[1] + q[1];
5122 		float z2 = q[2] + q[2];
5125 		float xx = q[0] * x2;
5126 		float yy = q[1] * y2;
5127 		float zz = q[2] * z2;
// diagonal of the rotation part: 1 - 2(yy+zz), etc.
5129 		m[0*4+0] = 1.0f - yy - zz;
5130 		m[1*4+1] = 1.0f - xx - zz;
5131 		m[2*4+2] = 1.0f - xx - yy;
// cross terms for the off-diagonal entries (assignments elided from listing)
5135 		float yz = q[1] * z2;
5136 		float wx = q[3] * x2;
5143 		float xy = q[0] * y2;
5144 		float wz = q[3] * z2;
5151 		float xz = q[0] * z2;
5152 		float wy = q[3] * y2;
5162 idSIMD_AltiVec::ConvertJointMatsToJointQuats
// idSIMD_AltiVec::ConvertJointMatsToJointQuats
//
// Scalar conversion of each 3x4 joint matrix back to a quaternion plus
// translation, using the trace-based method with the largest-diagonal-element
// fallback (the 'next' table cycles i -> j -> k).  The only AltiVec-era
// optimization is FastScalarInvSqrt in place of idMath::InvSqrt (see the
// original comment above).
// NOTE(review): this listing is an elided excerpt - some statements between
// the visible lines (e.g. the w-component assignments) are not shown here.
5165 void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
5169 	// Since we use very little of the data we have to pull in for the altivec version, we end up with
5170 	// a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
5171 	// of InvSqrt for the G5, and made it use that instead. With only this change, we get a little
5172 	// bigger than 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
5173 	// my function so everyone can benefit on G5.
5175 	for ( index = 0; index < numJoints; index++ ) {
// cyclic successor table used by the largest-diagonal fallback below
5185 		static int next[3] = { 1, 2, 0 };
5187 		float *mat = (float*)( jointMats[index].ToFloatPtr() );
5188 		trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
// fast path: positive trace, w is the dominant component
5190 		if ( trace > 0.0f ) {
5193 			//s = idMath::InvSqrt( t ) * 0.5f;
5194 			s = FastScalarInvSqrt( t ) * 0.5f;
5197 			jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
5198 			jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
5199 			jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
// fallback: pick the largest diagonal element as the dominant axis i
5204 			if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
5207 			if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
5213 			t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
5214 			//s = idMath::InvSqrt( t ) * 0.5f;
5215 			s = FastScalarInvSqrt( t ) * 0.5f;
5218 			jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
5219 			jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
5220 			jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
// translation is the 4th column of the 3x4 matrix
5223 		jq.t[0] = mat[0 * 4 + 3];
5224 		jq.t[1] = mat[1 * 4 + 3];
5225 		jq.t[2] = mat[2 * 4 + 3];
5226 		jointQuats[index] = jq;
5232 idSIMD_AltiVec::TransformJoints
// idSIMD_AltiVec::TransformJoints
//
// Concatenates each joint matrix with its parent: jointMats[i] *= parent.
// Cannot be unrolled across iterations because a joint may read the result a
// previous iteration just wrote (parents[i] < i is asserted, not assumed).
// Each 3x4 matrix (12 floats) is loaded unaligned as three vectors, the
// product is formed with splat+madd column combines, and the parent's
// translation column is added via a select-style permute.
// NOTE(review): this listing is an elided excerpt - the first scalar loop at
// the top appears to be an alternate/disabled path (its guard is not visible
// here); confirm against the unelided source.
5235 void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5238 	for( i = firstJoint; i <= lastJoint; i++ ) {
5239 		assert( parents[i] < i );
5240 		jointMats[i] *= jointMats[parents[i]];
5244 	// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
5245 	// on what the parents array looks like. This is true in the test code.
5246 	for ( i = firstJoint; i <= lastJoint; i++ ) {
5247 		assert( parents[i] < i );
5248 		float *jointPtr = jointMats[i].ToFloatPtr();
5249 		float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5251 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5252 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5253 		vector float v0, v1, v2, v3, v4, v5, v6, v7;
5255 		// we need to load up 12 float elements that make up the Mat
5256 		v0 = vec_ld( 0, jointPtr );
5257 		v1 = vec_ld( 15, jointPtr );
5258 		v2 = vec_ld( 31, jointPtr );
5259 		v3 = vec_ld( 47, jointPtr );
5262 		v4 = vec_ld( 0, parentPtr );
5263 		v5 = vec_ld( 15, parentPtr );
5264 		v6 = vec_ld( 31, parentPtr );
5265 		v7 = vec_ld( 47, parentPtr );
5267 		// permute into vectors
5268 		vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5269 		vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5270 		vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5272 		vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5273 		vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5274 		vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5276 		vector float zero = (vector float)(0);
5277 		vector float C1, C2, C3;
// each result row = sum over columns of joint row * splatted parent element
5280 		C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
5281 		C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
5282 		C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
5284 		C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
5285 		C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
5286 		C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
5288 		C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
5289 		C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
5290 		C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5292 		// do the addition at the end
// permZeroAndLast keeps lanes 0-2 zero and pulls the parent's 4th (translation)
// element into lane 3, so only the translation component gets the add
5293 		vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5294 		C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5295 		C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5296 		C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5299 		UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
5306 idSIMD_AltiVec::UntransformJoints
// idSIMD_AltiVec::UntransformJoints
//
// Inverse of TransformJoints: removes the parent transform from each joint
// matrix (jointMats[i] /= parent), iterating from last to first so children
// are untransformed before their parents change.  The parent's translation is
// subtracted up front, then the result rows are formed with splat+madd using
// the transpose of the parent's rotation.
// NOTE(review): this listing is an elided excerpt - the first scalar loop at
// the top appears to be an alternate/disabled path (its guard is not visible
// here); confirm against the unelided source.
5309 void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5312 	for( i = lastJoint; i >= firstJoint; i-- ) {
5313 		assert( parents[i] < i );
5314 		jointMats[i] /= jointMats[parents[i]];
5317 	// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
5318 	// on what the parents array looks like. This is true in the test code.
5319 	for ( i = lastJoint; i >= firstJoint; i-- ) {
5320 		assert( parents[i] < i );
5321 		float *jointPtr = jointMats[i].ToFloatPtr();
5322 		float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5324 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5325 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5326 		vector float v0, v1, v2, v3, v4, v5, v6, v7;
5328 		// we need to load up 12 float elements that make up the Mat
5329 		v0 = vec_ld( 0, jointPtr );
5330 		v1 = vec_ld( 15, jointPtr );
5331 		v2 = vec_ld( 31, jointPtr );
5332 		v3 = vec_ld( 47, jointPtr );
5335 		v4 = vec_ld( 0, parentPtr );
5336 		v5 = vec_ld( 15, parentPtr );
5337 		v6 = vec_ld( 31, parentPtr );
5338 		v7 = vec_ld( 47, parentPtr );
5340 		// permute into vectors
5341 		vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5342 		vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5343 		vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5345 		vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5346 		vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5347 		vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5349 		vector float zero = (vector float)(0);
5350 		vector float C1, C2, C3;
5352 		// do subtraction at the beginning
// lanes 0-2 stay untouched; only the translation lane (element 3) gets the
// parent translation subtracted, mirroring the add in TransformJoints
5353 		vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5354 		vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5355 		vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5356 		vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
// note the splat indices run across the parent rows (transposed access),
// i.e. multiply by the inverse of the orthonormal rotation part
5359 		C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
5360 		C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
5361 		C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
5363 		C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
5364 		C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
5365 		C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
5367 		C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
5368 		C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
5369 		C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5371 		// store results back
5372 		vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
5374 		// right rotate the input data
5375 		C1 = vec_perm( C1, C1, storePerm );
5376 		C2 = vec_perm( C2, C2, storePerm );
5377 		C3 = vec_perm( C3, C3, storePerm );
// element-wise scatter of all 12 floats back to the (unaligned) joint matrix
5379 		vec_ste( C1, 0, (float*) jointPtr );
5380 		vec_ste( C1, 4, (float*) jointPtr );
5381 		vec_ste( C1, 8, (float*) jointPtr );
5382 		vec_ste( C1, 12, (float*) jointPtr );
5384 		vec_ste( C2, 16, (float*) jointPtr );
5385 		vec_ste( C2, 20, (float*) jointPtr );
5386 		vec_ste( C2, 24, (float*) jointPtr );
5387 		vec_ste( C2, 28, (float*) jointPtr );
5389 		vec_ste( C3, 32, (float*) jointPtr );
5390 		vec_ste( C3, 36, (float*) jointPtr );
5391 		vec_ste( C3, 40, (float*) jointPtr );
5392 		vec_ste( C3, 44, (float*) jointPtr );
5400 idSIMD_AltiVec::TransformVerts
5404 // Here we don't have much for the vector unit to do, and the gain we get from doing the math
5405 // in parallel is eaten by doing unaligned stores.
// idSIMD_AltiVec::TransformVerts
//
// Skins vertices: each vertex position is the weighted sum of one or more
// joint-matrix transforms.  index[j*2] is a byte offset into the joint
// matrices, index[j*2+1] is a last-weight flag (0 means more weights follow
// for this vertex).  Kept scalar on purpose - see the comment above: the
// unaligned stores eat any vector-unit gain.
// NOTE(review): this listing is an elided excerpt - the j increments and the
// final store of v into verts[i].xyz between the visible lines are not shown
// here.
5406 void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
5408 	const byte *jointsPtr = (byte *)joints;
5410 	for( j = i = 0; i < numVerts; i++ ) {
// first weight: initialize v with matrix-row dot weight products
5413 		float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5414 		float *weightPtr = (float*) weights[j].ToFloatPtr();
5416 		v[0] = matPtrOrig[0] * weightPtr[0];
5417 		v[0] += matPtrOrig[1] * weightPtr[1];
5418 		v[0] += matPtrOrig[2] * weightPtr[2];
5419 		v[0] += matPtrOrig[3] * weightPtr[3];
5421 		v[1] = matPtrOrig[4] * weightPtr[0];
5422 		v[1] += matPtrOrig[5] * weightPtr[1];
5423 		v[1] += matPtrOrig[6] * weightPtr[2];
5424 		v[1] += matPtrOrig[7] * weightPtr[3];
5426 		v[2] = matPtrOrig[8] * weightPtr[0];
5427 		v[2] += matPtrOrig[9] * weightPtr[1];
5428 		v[2] += matPtrOrig[10] * weightPtr[2];
5429 		v[2] += matPtrOrig[11] * weightPtr[3];
// accumulate remaining weights while the last-weight flag is 0
5431 		while( index[j*2+1] == 0 ) {
5433 			float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5434 			weightPtr = (float*) weights[j].ToFloatPtr();
5436 			v[0] += matPtr[0] * weightPtr[0];
5437 			v[0] += matPtr[1] * weightPtr[1];
5438 			v[0] += matPtr[2] * weightPtr[2];
5439 			v[0] += matPtr[3] * weightPtr[3];
5441 			v[1] += matPtr[4] * weightPtr[0];
5442 			v[1] += matPtr[5] * weightPtr[1];
5443 			v[1] += matPtr[6] * weightPtr[2];
5444 			v[1] += matPtr[7] * weightPtr[3];
5446 			v[2] += matPtr[8] * weightPtr[0];
5447 			v[2] += matPtr[9] * weightPtr[1];
5448 			v[2] += matPtr[10] * weightPtr[2];
5449 			v[2] += matPtr[11] * weightPtr[3];
5456 #endif /* LIVE_VICARIOUSLY */
5460 #ifndef DRAWVERT_PADDED
5463 idSIMD_AltiVec::TracePointCull
5466 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5469 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5475 const float *planePtr = planes[0].ToFloatPtr();
5477 vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5478 vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5479 vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5480 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5481 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5482 vector unsigned char vecPerm;
5483 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5484 vector float zeroVector = (vector float)(0);
5485 vector float vecRadius;
5486 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5487 vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5488 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5489 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5490 vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5491 vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5492 vector bool int oneIntVector = (vector bool int)(1);
5493 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5494 vector unsigned int vecTotals;
5495 vector unsigned int tempIntSum;
5496 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5498 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5501 v0 = vec_ld( 0, planePtr );
5502 v1 = vec_ld( 15, planePtr );
5503 vecPlane0 = vec_perm( v0, v1, vecPerm );
5505 v2 = vec_ld( 0, planePtr + 4 );
5506 v3 = vec_ld( 15, planePtr + 4 );
5507 vecPlane1 = vec_perm( v2, v3, vecPerm );
5509 v0 = vec_ld( 0, planePtr + 8 );
5510 v1 = vec_ld( 15, planePtr + 8 );
5511 vecPlane2 = vec_perm( v0, v1, vecPerm );
5513 v2 = vec_ld( 0, planePtr + 12 );
5514 v3 = vec_ld( 15, planePtr + 12 );
5515 vecPlane3 = vec_perm( v2, v3, vecPerm );
5518 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5519 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5520 v2 = vec_mergel( vecPlane0, vecPlane2 );
5521 v3 = vec_mergel( vecPlane1, vecPlane3 );
5523 vecPlane0 = vec_mergeh( v0, v1 );
5524 vecPlane1 = vec_mergel( v0, v1 );
5525 vecPlane2 = vec_mergeh( v2, v3 );
5526 vecPlane3 = vec_mergel( v2, v3 );
5529 vecRadius = loadSplatUnalignedScalar( &radius );
5531 unsigned int cullBitVal[4];
5532 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5535 // every fourth one will have the same alignment. Make sure we've got enough here
5536 if ( i+3 < numVerts ) {
5537 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5538 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5539 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5540 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5544 for ( ; i+3 < numVerts; i+=4 ) {
5545 const float *vertPtr = verts[i].xyz.ToFloatPtr();
5546 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5547 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5548 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5550 v0 = vec_ld( 0, vertPtr );
5551 v1 = vec_ld( 15, vertPtr );
5552 v2 = vec_ld( 0, vertPtr2 );
5553 v3 = vec_ld( 15, vertPtr2 );
5554 v4 = vec_ld( 0, vertPtr3 );
5555 v5 = vec_ld( 15, vertPtr3 );
5556 v6 = vec_ld( 0, vertPtr4 );
5557 v7 = vec_ld( 15, vertPtr4 );
5559 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
5560 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
5561 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
5562 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
5564 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5565 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5566 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5567 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5569 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5570 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5571 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5572 vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5574 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5575 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5576 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5577 vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5579 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5580 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5581 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5582 vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5584 // vec1Sum1 now holds d0, d1, d2, d3. calculate the
5585 // difference with +radius and -radius
5586 vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5587 vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5588 vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5589 vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5590 vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5591 vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5592 vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5593 vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5596 vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5597 vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5598 vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5599 vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5600 vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5601 vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5602 vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5603 vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5605 //and it with 1 so we multiply by 1 not 1111's
5606 vecCmp1 = vec_and( vecCmp1, oneIntVector );
5607 vecCmp2 = vec_and( vecCmp2, oneIntVector );
5608 vecCmp3 = vec_and( vecCmp3, oneIntVector );
5609 vecCmp4 = vec_and( vecCmp4, oneIntVector );
5610 vecCmp5 = vec_and( vecCmp5, oneIntVector );
5611 vecCmp6 = vec_and( vecCmp6, oneIntVector );
5612 vecCmp7 = vec_and( vecCmp7, oneIntVector );
5613 vecCmp8 = vec_and( vecCmp8, oneIntVector );
5615 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5616 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5617 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5618 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5619 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5620 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5621 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5622 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5624 // OR (add) them all together
5625 vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5626 vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5627 vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5628 vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5630 vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5631 vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5632 tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5633 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5634 vecTotals = vec_mergeh( vecTotals, tempIntSum );
5635 tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5636 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5637 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5638 tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5639 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5640 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5642 // store out results
5643 vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5644 tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5645 vec_ste( tempSt, 0, &cullBitVal[0] );
5646 vec_ste( tempSt, 4, &cullBitVal[0] );
5647 vec_ste( tempSt, 8, &cullBitVal[0] );
5648 vec_ste( tempSt, 12, &cullBitVal[0] );
5650 tOr |= cullBitVal[0];
5651 tOr |= cullBitVal[1];
5652 tOr |= cullBitVal[2];
5653 tOr |= cullBitVal[3];
5655 cullBits[i] = cullBitVal[0];
5656 cullBits[i+1] = cullBitVal[1];
5657 cullBits[i+2] = cullBitVal[2];
5658 cullBits[i+3] = cullBitVal[3];
5662 for ( ; i < numVerts; i++ ) {
5664 float d0, d1, d2, d3, t;
5665 const idVec3 &v = verts[i].xyz;
5667 d0 = planes[0].Distance( v );
5668 d1 = planes[1].Distance( v );
5669 d2 = planes[2].Distance( v );
5670 d3 = planes[3].Distance( v );
5673 bits = FLOATSIGNBITSET( t ) << 0;
5675 bits |= FLOATSIGNBITSET( t ) << 1;
5677 bits |= FLOATSIGNBITSET( t ) << 2;
5679 bits |= FLOATSIGNBITSET( t ) << 3;
5682 bits |= FLOATSIGNBITSET( t ) << 4;
5684 bits |= FLOATSIGNBITSET( t ) << 5;
5686 bits |= FLOATSIGNBITSET( t ) << 6;
5688 bits |= FLOATSIGNBITSET( t ) << 7;
5690 bits ^= 0x0F; // flip lower four bits
5702 idSIMD_AltiVec::TracePointCull
5705 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5708 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5714 const float *planePtr = planes[0].ToFloatPtr();
5716 vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5717 vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5718 vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5719 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5720 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5721 vector unsigned char vecPerm;
5722 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5723 vector float zeroVector = (vector float)(0);
5724 vector float vecRadius;
5725 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5726 vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5727 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5728 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5729 vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5730 vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5731 vector bool int oneIntVector = (vector bool int)(1);
5732 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5733 vector unsigned int vecTotals;
5734 vector unsigned int tempIntSum;
5735 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5737 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5740 v0 = vec_ld( 0, planePtr );
5741 v1 = vec_ld( 15, planePtr );
5742 vecPlane0 = vec_perm( v0, v1, vecPerm );
5744 v2 = vec_ld( 0, planePtr + 4 );
5745 v3 = vec_ld( 15, planePtr + 4 );
5746 vecPlane1 = vec_perm( v2, v3, vecPerm );
5748 v0 = vec_ld( 0, planePtr + 8 );
5749 v1 = vec_ld( 15, planePtr + 8 );
5750 vecPlane2 = vec_perm( v0, v1, vecPerm );
5752 v2 = vec_ld( 0, planePtr + 12 );
5753 v3 = vec_ld( 15, planePtr + 12 );
5754 vecPlane3 = vec_perm( v2, v3, vecPerm );
5757 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5758 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5759 v2 = vec_mergel( vecPlane0, vecPlane2 );
5760 v3 = vec_mergel( vecPlane1, vecPlane3 );
5762 vecPlane0 = vec_mergeh( v0, v1 );
5763 vecPlane1 = vec_mergel( v0, v1 );
5764 vecPlane2 = vec_mergeh( v2, v3 );
5765 vecPlane3 = vec_mergel( v2, v3 );
5768 vecRadius = loadSplatUnalignedScalar( &radius );
5770 unsigned int cullBitVal[4];
5771 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5775 for ( ; i+3 < numVerts; i+=4 ) {
5776 const float *vertPtr = verts[i].xyz.ToFloatPtr();
5777 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5778 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5779 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5781 vecXYZ1 = vec_ld( 0, vertPtr );
5782 vecXYZ2 = vec_ld( 0, vertPtr2 );
5783 vecXYZ3 = vec_ld( 0, vertPtr3 );
5784 vecXYZ4 = vec_ld( 0, vertPtr4 );
5786 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5787 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5788 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5789 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5791 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5792 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5793 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5794 vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5796 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5797 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5798 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5799 vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5801 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5802 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5803 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5804 vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5806 // vec1Sum1 now holds d0, d1, d2, d3. calculate the
5807 // difference with +radius and -radius
5808 vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5809 vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5810 vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5811 vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5812 vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5813 vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5814 vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5815 vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5818 vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5819 vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5820 vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5821 vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5822 vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5823 vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5824 vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5825 vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5827 //and it with 1 so we multiply by 1 not 1111's
5828 vecCmp1 = vec_and( vecCmp1, oneIntVector );
5829 vecCmp2 = vec_and( vecCmp2, oneIntVector );
5830 vecCmp3 = vec_and( vecCmp3, oneIntVector );
5831 vecCmp4 = vec_and( vecCmp4, oneIntVector );
5832 vecCmp5 = vec_and( vecCmp5, oneIntVector );
5833 vecCmp6 = vec_and( vecCmp6, oneIntVector );
5834 vecCmp7 = vec_and( vecCmp7, oneIntVector );
5835 vecCmp8 = vec_and( vecCmp8, oneIntVector );
5837 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5838 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5839 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5840 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5841 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5842 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5843 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5844 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5846 // OR (add) them all together
5847 vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5848 vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5849 vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5850 vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5852 vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5853 vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5854 tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5855 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5856 vecTotals = vec_mergeh( vecTotals, tempIntSum );
5857 tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5858 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5859 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5860 tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5861 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5862 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5864 // store out results
5865 vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5866 tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5867 vec_ste( tempSt, 0, &cullBitVal[0] );
5868 vec_ste( tempSt, 4, &cullBitVal[0] );
5869 vec_ste( tempSt, 8, &cullBitVal[0] );
5870 vec_ste( tempSt, 12, &cullBitVal[0] );
5872 tOr |= cullBitVal[0];
5873 tOr |= cullBitVal[1];
5874 tOr |= cullBitVal[2];
5875 tOr |= cullBitVal[3];
5877 cullBits[i] = cullBitVal[0];
5878 cullBits[i+1] = cullBitVal[1];
5879 cullBits[i+2] = cullBitVal[2];
5880 cullBits[i+3] = cullBitVal[3];
5884 for ( ; i < numVerts; i++ ) {
5886 float d0, d1, d2, d3, t;
5887 const idVec3 &v = verts[i].xyz;
5889 d0 = planes[0].Distance( v );
5890 d1 = planes[1].Distance( v );
5891 d2 = planes[2].Distance( v );
5892 d3 = planes[3].Distance( v );
5895 bits = FLOATSIGNBITSET( t ) << 0;
5897 bits |= FLOATSIGNBITSET( t ) << 1;
5899 bits |= FLOATSIGNBITSET( t ) << 2;
5901 bits |= FLOATSIGNBITSET( t ) << 3;
5904 bits |= FLOATSIGNBITSET( t ) << 4;
5906 bits |= FLOATSIGNBITSET( t ) << 5;
5908 bits |= FLOATSIGNBITSET( t ) << 6;
5910 bits |= FLOATSIGNBITSET( t ) << 7;
5912 bits ^= 0x0F; // flip lower four bits
5921 #endif /* DRAWVERT_PADDED */
5923 #ifndef DRAWVERT_PADDED
5926 idSIMD_AltiVec::DecalPointCull
5929 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5932 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5935 const float *planePtr = planes[0].ToFloatPtr();
5937 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
5938 vector float zeroVector = (vector float)(0.0);
5939 vector unsigned char vecPerm;
5940 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5942 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5945 v0 = vec_ld( 0, planePtr );
5946 v1 = vec_ld( 15, planePtr );
5947 vecPlane0 = vec_perm( v0, v1, vecPerm );
5949 v2 = vec_ld( 0, planePtr + 4 );
5950 v3 = vec_ld( 15, planePtr + 4 );
5951 vecPlane1 = vec_perm( v2, v3, vecPerm );
5953 v0 = vec_ld( 0, planePtr + 8 );
5954 v1 = vec_ld( 15, planePtr + 8 );
5955 vecPlane2 = vec_perm( v0, v1, vecPerm );
5957 v2 = vec_ld( 0, planePtr + 12 );
5958 v3 = vec_ld( 15, planePtr + 12 );
5959 vecPlane3 = vec_perm( v2, v3, vecPerm );
5961 v0 = vec_ld( 0, planePtr + 16 );
5962 v1 = vec_ld( 15, planePtr + 16 );
5963 vecPlane4 = vec_perm( v0, v1, vecPerm );
5965 v2 = vec_ld( 0, planePtr + 20 );
5966 v3 = vec_ld( 15, planePtr + 20 );
5967 vecPlane5 = vec_perm( v2, v3, vecPerm );
5970 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5971 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5972 v2 = vec_mergel( vecPlane0, vecPlane2 );
5973 v3 = vec_mergel( vecPlane1, vecPlane3 );
5975 vecPlane0 = vec_mergeh( v0, v1 );
5976 vecPlane1 = vec_mergel( v0, v1 );
5977 vecPlane2 = vec_mergeh( v2, v3 );
5978 vecPlane3 = vec_mergel( v2, v3 );
5980 v0 = vec_mergeh( vecPlane4, zeroVector );
5981 v1 = vec_mergeh( vecPlane5, zeroVector );
5982 v2 = vec_mergel( vecPlane4, zeroVector );
5983 v3 = vec_mergel( vecPlane5, zeroVector );
5985 vecPlane4 = vec_mergeh( v0, v1 );
5986 vecPlane5 = vec_mergel( v0, v1 );
5987 vecPlane6 = vec_mergeh( v2, v3 );
5988 vecPlane7 = vec_mergel( v2, v3 );
5991 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5992 vector bool int oneIntVector = (vector bool int)(1);
5993 vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
5994 vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
5995 vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
5997 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5998 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
5999 vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6000 vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6001 vector unsigned int vecR1, vecR2, vecR3, vecR4;
6002 vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6003 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6004 unsigned int vBits[4];
6005 vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6008 // every fourth one will have the same alignment. Make sure we've got enough here
6009 if ( i+3 < numVerts ) {
6010 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6011 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6012 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6013 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6017 for ( ; i+3 < numVerts; i+=4 ) {
6018 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6019 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6020 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6021 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6023 v0 = vec_ld( 0, vertPtr );
6024 v1 = vec_ld( 15, vertPtr );
6025 v2 = vec_ld( 0, vertPtr2 );
6026 v3 = vec_ld( 15, vertPtr2 );
6027 v4 = vec_ld( 0, vertPtr3 );
6028 v5 = vec_ld( 15, vertPtr3 );
6029 v6 = vec_ld( 0, vertPtr4 );
6030 v7 = vec_ld( 15, vertPtr4 );
6032 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6033 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6034 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6035 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6037 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6038 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6039 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6040 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6042 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6043 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6044 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6045 vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6047 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6048 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6049 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6050 vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6052 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6053 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6054 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6055 vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6057 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6058 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6059 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6060 vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6062 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6063 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6064 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6065 vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6067 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6068 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6069 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6070 vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6072 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6073 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6074 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6075 vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6077 vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6078 vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6079 vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6080 vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6081 vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6082 vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6083 vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6084 vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6086 //and it with 1 so we multiply by 1 not 1111's
6087 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6088 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6089 vecCmp3 = vec_and( vecCmp3, oneIntVector );
6090 vecCmp4 = vec_and( vecCmp4, oneIntVector );
6091 vecCmp5 = vec_and( vecCmp5, oneIntVector );
6092 vecCmp6 = vec_and( vecCmp6, oneIntVector );
6093 vecCmp7 = vec_and( vecCmp7, oneIntVector );
6094 vecCmp8 = vec_and( vecCmp8, oneIntVector );
6096 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6097 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6098 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6099 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6100 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6101 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6102 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6103 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6105 //OR them all together (this is the same as adding them, since they're all only 1 bit set)
6106 vecR1 = (vector unsigned int)(0); //zeroIntVector;
6107 vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6108 vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6109 vecR1 = vec_add(vecR1, vecBitShifted2 );
6110 vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6112 vecR2 = (vector unsigned int)(0); //zeroIntVector;
6113 vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6114 vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6115 vecR2 = vec_add(vecR2, vecBitShifted4 );
6116 vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6118 vecR3 = (vector unsigned int)(0); //zeroIntVector;
6119 vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6120 vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6121 vecR3 = vec_add(vecR3, vecBitShifted6 );
6122 vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6124 vecR4 = (vector unsigned int)(0); //zeroIntVector;
6125 vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6126 vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6127 vecR4 = vec_add(vecR4, vecBitShifted8 );
6128 vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6130 // take the first element from each vector and put them into vecR1
6131 vecR1 = vec_mergeh( vecR1, vecR2 );
6132 vecR3 = vec_mergeh( vecR3, vecR4 );
6133 vecR1 = vec_perm( vecR1, vecR3, permHalves );
6135 // XOR with 0x3F to flip lower 6 bits
6136 vecR1 = vec_xor( vecR1, vecFlipBits );
6138 // store out results. don't have 16 at a time so let's just
6139 // do this and avoid alignment concerns
6140 vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6141 vec_ste( vecR1, 0, &vBits[0] );
6142 vec_ste( vecR1, 4, &vBits[0] );
6143 vec_ste( vecR1, 8, &vBits[0] );
6144 vec_ste( vecR1, 12, &vBits[0] );
6146 cullBits[i] = vBits[0];
6147 cullBits[i+1] = vBits[1];
6148 cullBits[i+2] = vBits[2];
6149 cullBits[i+3] = vBits[3];
6152 for ( ; i < numVerts; i++ ) {
6154 float d0, d1, d2, d3, d4, d5;
6155 const idVec3 &v = verts[i].xyz;
6157 d0 = planes[0].Distance( v );
6158 d1 = planes[1].Distance( v );
6159 d2 = planes[2].Distance( v );
6160 d3 = planes[3].Distance( v );
6161 d4 = planes[4].Distance( v );
6162 d5 = planes[5].Distance( v );
6164 // they check if the sign bit is set by casting as long and shifting right 31 places.
6165 bits = FLOATSIGNBITSET( d0 ) << 0;
6166 bits |= FLOATSIGNBITSET( d1 ) << 1;
6167 bits |= FLOATSIGNBITSET( d2 ) << 2;
6168 bits |= FLOATSIGNBITSET( d3 ) << 3;
6169 bits |= FLOATSIGNBITSET( d4 ) << 4;
6170 bits |= FLOATSIGNBITSET( d5 ) << 5;
6172 cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6180 idSIMD_AltiVec::DecalPointCull
6183 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6186 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6189 const float *planePtr = planes[0].ToFloatPtr();
6191 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
6192 vector float zeroVector = (vector float)(0.0);
6193 vector unsigned char vecPerm;
6194 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6196 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6199 v0 = vec_ld( 0, planePtr );
6200 v1 = vec_ld( 15, planePtr );
6201 vecPlane0 = vec_perm( v0, v1, vecPerm );
6203 v2 = vec_ld( 0, planePtr + 4 );
6204 v3 = vec_ld( 15, planePtr + 4 );
6205 vecPlane1 = vec_perm( v2, v3, vecPerm );
6207 v0 = vec_ld( 0, planePtr + 8 );
6208 v1 = vec_ld( 15, planePtr + 8 );
6209 vecPlane2 = vec_perm( v0, v1, vecPerm );
6211 v2 = vec_ld( 0, planePtr + 12 );
6212 v3 = vec_ld( 15, planePtr + 12 );
6213 vecPlane3 = vec_perm( v2, v3, vecPerm );
6215 v0 = vec_ld( 0, planePtr + 16 );
6216 v1 = vec_ld( 15, planePtr + 16 );
6217 vecPlane4 = vec_perm( v0, v1, vecPerm );
6219 v2 = vec_ld( 0, planePtr + 20 );
6220 v3 = vec_ld( 15, planePtr + 20 );
6221 vecPlane5 = vec_perm( v2, v3, vecPerm );
6224 v0 = vec_mergeh( vecPlane0, vecPlane2 );
6225 v1 = vec_mergeh( vecPlane1, vecPlane3 );
6226 v2 = vec_mergel( vecPlane0, vecPlane2 );
6227 v3 = vec_mergel( vecPlane1, vecPlane3 );
6229 vecPlane0 = vec_mergeh( v0, v1 );
6230 vecPlane1 = vec_mergel( v0, v1 );
6231 vecPlane2 = vec_mergeh( v2, v3 );
6232 vecPlane3 = vec_mergel( v2, v3 );
6234 v0 = vec_mergeh( vecPlane4, zeroVector );
6235 v1 = vec_mergeh( vecPlane5, zeroVector );
6236 v2 = vec_mergel( vecPlane4, zeroVector );
6237 v3 = vec_mergel( vecPlane5, zeroVector );
6239 vecPlane4 = vec_mergeh( v0, v1 );
6240 vecPlane5 = vec_mergel( v0, v1 );
6241 vecPlane6 = vec_mergeh( v2, v3 );
6242 vecPlane7 = vec_mergel( v2, v3 );
6245 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6246 vector bool int oneIntVector = (vector bool int)(1);
6247 vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
6248 vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
6249 vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
6251 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
6252 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
6253 vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6254 vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6255 vector unsigned int vecR1, vecR2, vecR3, vecR4;
6256 vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6257 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6258 unsigned int vBits[4];
6259 vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6263 for ( ; i+3 < numVerts; i+=4 ) {
6264 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6265 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6266 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6267 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6269 v0 = vec_ld( 0, vertPtr );
6270 v2 = vec_ld( 0, vertPtr2 );
6271 v4 = vec_ld( 0, vertPtr3 );
6272 v6 = vec_ld( 0, vertPtr4 );
6274 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6275 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6276 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6277 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6279 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6280 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6281 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6282 vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6284 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6285 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6286 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6287 vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6289 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6290 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6291 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6292 vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6294 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6295 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6296 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6297 vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6299 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6300 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6301 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6302 vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6304 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6305 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6306 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6307 vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6309 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6310 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6311 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6312 vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6314 vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6315 vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6316 vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6317 vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6318 vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6319 vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6320 vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6321 vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6323 //and it with 1 so we multiply by 1 not 1111's
6324 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6325 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6326 vecCmp3 = vec_and( vecCmp3, oneIntVector );
6327 vecCmp4 = vec_and( vecCmp4, oneIntVector );
6328 vecCmp5 = vec_and( vecCmp5, oneIntVector );
6329 vecCmp6 = vec_and( vecCmp6, oneIntVector );
6330 vecCmp7 = vec_and( vecCmp7, oneIntVector );
6331 vecCmp8 = vec_and( vecCmp8, oneIntVector );
6333 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6334 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6335 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6336 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6337 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6338 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6339 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6340 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6342 //OR them all together (this is the same as adding them, since they're all only 1 bit set)
6343 vecR1 = (vector unsigned int)(0); //zeroIntVector;
6344 vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6345 vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6346 vecR1 = vec_add(vecR1, vecBitShifted2 );
6347 vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6349 vecR2 = (vector unsigned int)(0); //zeroIntVector;
6350 vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6351 vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6352 vecR2 = vec_add(vecR2, vecBitShifted4 );
6353 vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6355 vecR3 = (vector unsigned int)(0); //zeroIntVector;
6356 vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6357 vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6358 vecR3 = vec_add(vecR3, vecBitShifted6 );
6359 vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6361 vecR4 = (vector unsigned int)(0); //zeroIntVector;
6362 vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6363 vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6364 vecR4 = vec_add(vecR4, vecBitShifted8 );
6365 vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6367 // take the first element from each vector and put them into vecR1
6368 vecR1 = vec_mergeh( vecR1, vecR2 );
6369 vecR3 = vec_mergeh( vecR3, vecR4 );
6370 vecR1 = vec_perm( vecR1, vecR3, permHalves );
6372 // XOR with 0x3F to flip lower 6 bits
6373 vecR1 = vec_xor( vecR1, vecFlipBits );
6375 // store out results. don't have 16 at a time so let's just
6376 // do this and avoid alignment concerns
6377 vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6378 vec_ste( vecR1, 0, &vBits[0] );
6379 vec_ste( vecR1, 4, &vBits[0] );
6380 vec_ste( vecR1, 8, &vBits[0] );
6381 vec_ste( vecR1, 12, &vBits[0] );
6383 cullBits[i] = vBits[0];
6384 cullBits[i+1] = vBits[1];
6385 cullBits[i+2] = vBits[2];
6386 cullBits[i+3] = vBits[3];
6389 for ( ; i < numVerts; i++ ) {
6391 float d0, d1, d2, d3, d4, d5;
6392 const idVec3 &v = verts[i].xyz;
6394 d0 = planes[0].Distance( v );
6395 d1 = planes[1].Distance( v );
6396 d2 = planes[2].Distance( v );
6397 d3 = planes[3].Distance( v );
6398 d4 = planes[4].Distance( v );
6399 d5 = planes[5].Distance( v );
6401 // they check if the sign bit is set by casting as long and shifting right 31 places.
6402 bits = FLOATSIGNBITSET( d0 ) << 0;
6403 bits |= FLOATSIGNBITSET( d1 ) << 1;
6404 bits |= FLOATSIGNBITSET( d2 ) << 2;
6405 bits |= FLOATSIGNBITSET( d3 ) << 3;
6406 bits |= FLOATSIGNBITSET( d4 ) << 4;
6407 bits |= FLOATSIGNBITSET( d5 ) << 5;
6409 cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6414 #endif /*DRAWVERT_PADDED */
6416 #ifndef DRAWVERT_PADDED
6419 idSIMD_AltiVec::OverlayPointCull
// idSIMD_AltiVec::OverlayPointCull -- unpadded-idDrawVert variant.
// For each vertex: compute distances d0/d1 to the first two planes, write them
// to texCoords[i], and pack cull bits: bit0 = d0 < 0, bit1 = d1 < 0,
// bit2 = (1 - d0) < 0, bit3 = (1 - d1) < 0.
// NOTE(review): this listing has lost some original lines (e.g. the 'int i = 0;'
// declaration, the scalar-loop locals, inversion statements and closing braces);
// verify against the upstream file before building.
6422 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6425 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
// scalar copies of plane 0 and plane 1 (normal xyz + distance) for the cleanup loop
6429 float p0x, p0y, p0z, p0d;
6430 float p1x, p1y, p1z, p1d;
6432 const float *planePtr = planes[0].ToFloatPtr();
6433 const float *vertPtr = verts[0].xyz.ToFloatPtr();
6435 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6436 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6437 vector unsigned char vecPerm;
6438 vector float zeroVector = (vector float)(0);
6440 p0x = *(planePtr + 0);
6441 p0y = *(planePtr + 1);
6442 p0z = *(planePtr + 2);
6443 p0d = *(planePtr + 3);
6444 p1x = *(planePtr + 4);
6445 p1y = *(planePtr + 5);
6446 p1z = *(planePtr + 6);
6447 p1d = *(planePtr + 7);
// unaligned load of the two planes via lvsl-generated permute
6449 // populate the planes
6450 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6451 v0 = vec_ld( 0, planePtr );
6452 v1 = vec_ld( 15, planePtr );
6453 vecPlane0 = vec_perm( v0, v1, vecPerm );
6455 v2 = vec_ld( 31, planePtr );
6456 vecPlane1 = vec_perm( v1, v2, vecPerm );
// transpose so vecPlane0..3 hold {p0x,p1x,p0x,p1x}, {p0y,p1y,..}, {p0z,p1z,..}, {p0d,p1d,..}
6459 v0 = vec_mergeh( vecPlane0, vecPlane0 );
6460 v1 = vec_mergeh( vecPlane1, vecPlane1 );
6461 v2 = vec_mergel( vecPlane0, vecPlane0 );
6462 v3 = vec_mergel( vecPlane1, vecPlane1);
6464 vecPlane0 = vec_mergeh( v0, v1 );
6465 vecPlane1 = vec_mergel( v0, v1 );
6466 vecPlane2 = vec_mergeh( v2, v3 );
6467 vecPlane3 = vec_mergel( v2, v3 );
6469 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6470 vector float oneVector = (vector float)(1);
6472 vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6474 vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6475 vector float negTwoVector = (vector float)(-2);
6476 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
// lane shift amounts: bits 0/1 for d0/d1, bits 2/3 for the inverted distances
6477 vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6478 vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6479 vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6480 vector bool int oneIntVector = (vector bool int)(1);
6481 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6482 unsigned int cullBitVal[4];
// lvsr permute so vec_ste lands the packed results in cullBitVal[0..3]
6483 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
// idDrawVert stride is constant, so verts[i..i+3] share the alignment of verts[0..3];
// compute the four unaligned-load permutes once
6486 // every fourth one will have the same alignment. Make sure we've got enough here
6487 if ( i+3 < numVerts ) {
6488 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6489 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6490 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6491 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
// vectorized main loop: four vertices per iteration
6495 for ( ; i+3 < numVerts; i+=4 ) {
6496 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6497 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6498 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6499 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6501 v0 = vec_ld( 0, vertPtr );
6502 v1 = vec_ld( 15, vertPtr );
6503 v2 = vec_ld( 0, vertPtr2 );
6504 v3 = vec_ld( 15, vertPtr2 );
6505 v4 = vec_ld( 0, vertPtr3 );
6506 v5 = vec_ld( 15, vertPtr3 );
6507 v6 = vec_ld( 0, vertPtr4 );
6508 v7 = vec_ld( 15, vertPtr4 );
6510 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6511 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6512 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6513 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
// vecSum holds interleaved plane distances: { d0(v), d1(v), d0(v+1), d1(v+1) }
6515 // like a splat, but only doing halves
6516 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6517 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6518 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6519 vecSum1 = vec_add( vecSum1, vecPlane3 );
6521 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6522 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6523 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6524 vecSum2 = vec_add( vecSum2, vecPlane3 );
// the raw distances double as the overlay texture coordinates
6526 // store out results
6527 UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6530 vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6531 vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6533 //and it with 1 so we multiply by 1 not 1111's
6534 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6535 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6537 // store out and write to cullBits
6538 // finally, a use for algebra! 1-x = x + 1 - 2x
6539 vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6540 vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6541 vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6542 vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6544 // do the same comparisons for the inverted d0/d1
6545 vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6546 vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6548 //and it with 1 so we multiply by 1 not 1111's
6549 vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6550 vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
// move each 0/1 flag into its destination bit position
6552 // shift them as needed
6553 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6554 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6555 vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6556 vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6558 // OR them all together. since only 1 bit is set for each value, thats
6559 // the same as adding them. add up d0 + d1 + d0Inv + d1Inv
6560 vector unsigned int vecResult;
6561 vector unsigned int vecResult2;
6562 vector unsigned int vecResult3;
6563 vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6565 vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6567 // vecResult now holds the values without the inverses yet, so add those
6568 vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6569 vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6570 vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6571 vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6573 vecResult = vec_add( vecResult, vecResult2 );
// scatter the four packed words to cullBitVal, then write one byte per vertex
6576 vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6577 vec_ste( vecResult, 0, &cullBitVal[0] );
6578 vec_ste( vecResult, 4, &cullBitVal[0] );
6579 vec_ste( vecResult, 8, &cullBitVal[0] );
6580 vec_ste( vecResult, 12, &cullBitVal[0] );
6582 cullBits[i] = cullBitVal[0];
6583 cullBits[i+1] = cullBitVal[1];
6584 cullBits[i+2] = cullBitVal[2];
6585 cullBits[i+3] = cullBitVal[3];
// scalar cleanup for the 0-3 remaining vertices
// NOTE(review): the locals (byte bits; float d0, d1, vx, vy, vz;) are missing
// from this listing -- presumably dropped lines; confirm against upstream.
6589 for ( ; i < numVerts; i++ ) {
6594 vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6595 vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6596 vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6598 d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6599 d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6600 texCoords[i][0] = d0;
6601 texCoords[i][1] = d1;
6603 bits = ( d0 >= 0 ) ? 0 : 1;
// NOTE(review): the inversion statements (d0 = 1.0f - d0; d1 = 1.0f - d1;)
// that should precede the bit-2/bit-3 tests appear to be missing from this
// listing; without them bits 2/3 would just duplicate bits 0/1 -- verify.
6605 bits |= ( d1 >= 0 ) ? 0 : 1*2;
6608 bits |= ( d0 >= 0 ) ? 0: 1*4;
6609 bits |= ( d1 >= 0 ) ? 0: 1*8;
6618 idSIMD_AltiVec::OverlayPointCull
// idSIMD_AltiVec::OverlayPointCull -- DRAWVERT_PADDED variant.
// Identical contract to the unpadded version above, but idDrawVert is padded
// to a 16-byte multiple, so each xyz can be fetched with a single aligned
// vec_ld instead of a two-load + permute sequence.
// NOTE(review): this listing has lost some original lines (loop-variable and
// scalar-loop local declarations, closing braces); verify against upstream.
6621 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6624 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
// scalar copies of plane 0 and plane 1 for the cleanup loop
6628 float p0x, p0y, p0z, p0d;
6629 float p1x, p1y, p1z, p1d;
6631 const float *planePtr = planes[0].ToFloatPtr();
6632 const float *vertPtr = verts[0].xyz.ToFloatPtr();
6634 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6635 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6636 vector unsigned char vecPerm;
6637 vector float zeroVector = (vector float)(0);
6639 p0x = *(planePtr + 0);
6640 p0y = *(planePtr + 1);
6641 p0z = *(planePtr + 2);
6642 p0d = *(planePtr + 3);
6643 p1x = *(planePtr + 4);
6644 p1y = *(planePtr + 5);
6645 p1z = *(planePtr + 6);
6646 p1d = *(planePtr + 7);
// unaligned load of the two planes via lvsl-generated permute
6648 // populate the planes
6649 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6650 v0 = vec_ld( 0, planePtr );
6651 v1 = vec_ld( 15, planePtr );
6652 vecPlane0 = vec_perm( v0, v1, vecPerm );
6654 v2 = vec_ld( 31, planePtr );
6655 vecPlane1 = vec_perm( v1, v2, vecPerm );
// transpose so each vecPlaneN holds one component pair across both planes
6658 v0 = vec_mergeh( vecPlane0, vecPlane0 );
6659 v1 = vec_mergeh( vecPlane1, vecPlane1 );
6660 v2 = vec_mergel( vecPlane0, vecPlane0 );
6661 v3 = vec_mergel( vecPlane1, vecPlane1);
6663 vecPlane0 = vec_mergeh( v0, v1 );
6664 vecPlane1 = vec_mergel( v0, v1 );
6665 vecPlane2 = vec_mergeh( v2, v3 );
6666 vecPlane3 = vec_mergel( v2, v3 );
6668 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6669 vector float oneVector = (vector float)(1);
6671 vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6673 vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6674 vector float negTwoVector = (vector float)(-2);
6675 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
// lane shift amounts: bits 0/1 for d0/d1, bits 2/3 for the inverted distances
6676 vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6677 vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6678 vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6679 vector bool int oneIntVector = (vector bool int)(1);
6680 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6681 unsigned int cullBitVal[4];
// lvsr permute so vec_ste lands the packed results in cullBitVal[0..3]
6682 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
// vectorized main loop: four vertices per iteration, aligned loads
6686 for ( ; i+3 < numVerts; i+=4 ) {
6687 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6688 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6689 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6690 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6692 vecXYZ1 = vec_ld( 0, vertPtr );
6693 vecXYZ2 = vec_ld( 0, vertPtr2 );
6694 vecXYZ3 = vec_ld( 0, vertPtr3 );
6695 vecXYZ4 = vec_ld( 0, vertPtr4 );
// vecSum holds interleaved plane distances: { d0(v), d1(v), d0(v+1), d1(v+1) }
6697 // like a splat, but only doing halves
6698 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6699 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6700 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6701 vecSum1 = vec_add( vecSum1, vecPlane3 );
6703 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6704 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6705 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6706 vecSum2 = vec_add( vecSum2, vecPlane3 );
// the raw distances double as the overlay texture coordinates
6708 // store out results
6709 UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6712 vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6713 vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6715 //and it with 1 so we multiply by 1 not 1111's
6716 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6717 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6719 // store out and write to cullBits
6720 // finally, a use for algebra! 1-x = x + 1 - 2x
6721 vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6722 vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6723 vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6724 vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6726 // do the same comparisons for the inverted d0/d1
6727 vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6728 vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6730 //and it with 1 so we multiply by 1 not 1111's
6731 vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6732 vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
// move each 0/1 flag into its destination bit position
6734 // shift them as needed
6735 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6736 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6737 vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6738 vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6740 // OR them all together. since only 1 bit is set for each value, thats
6741 // the same as adding them. add up d0 + d1 + d0Inv + d1Inv
6742 vector unsigned int vecResult;
6743 vector unsigned int vecResult2;
6744 vector unsigned int vecResult3;
6745 vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6747 vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6749 // vecResult now holds the values without the inverses yet, so add those
6750 vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6751 vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6752 vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6753 vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6755 vecResult = vec_add( vecResult, vecResult2 );
// scatter the four packed words to cullBitVal, then write one byte per vertex
6758 vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6759 vec_ste( vecResult, 0, &cullBitVal[0] );
6760 vec_ste( vecResult, 4, &cullBitVal[0] );
6761 vec_ste( vecResult, 8, &cullBitVal[0] );
6762 vec_ste( vecResult, 12, &cullBitVal[0] );
6764 cullBits[i] = cullBitVal[0];
6765 cullBits[i+1] = cullBitVal[1];
6766 cullBits[i+2] = cullBitVal[2];
6767 cullBits[i+3] = cullBitVal[3];
// scalar cleanup for the 0-3 remaining vertices
// NOTE(review): the locals (byte bits; float d0, d1, vx, vy, vz;) are missing
// from this listing -- presumably dropped lines; confirm against upstream.
6771 for ( ; i < numVerts; i++ ) {
6776 vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6777 vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6778 vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6780 d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6781 d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6782 texCoords[i][0] = d0;
6783 texCoords[i][1] = d1;
6785 bits = ( d0 >= 0 ) ? 0 : 1;
// NOTE(review): the inversion statements (d0 = 1.0f - d0; d1 = 1.0f - d1;)
// that should precede the bit-2/bit-3 tests appear to be missing from this
// listing; without them bits 2/3 would just duplicate bits 0/1 -- verify.
6787 bits |= ( d1 >= 0 ) ? 0 : 1*2;
6790 bits |= ( d0 >= 0 ) ? 0: 1*4;
6791 bits |= ( d1 >= 0 ) ? 0: 1*8;
6798 #endif /* DRAWVERT_PADDED */
6800 #endif /* ENABLE_CULL */
6802 #ifdef ENABLE_DERIVE
6805 idSIMD_AltiVec::DeriveTriPlanes
6807 Derives a plane equation for each triangle.
// idSIMD_AltiVec::DeriveTriPlanes
// Derives a normalized plane equation for each triangle: four triangles per
// iteration in the vector loop (cross product of the two edge vectors,
// reciprocal-sqrt normalize, distance fitted through vertex A), scalar cleanup
// for the remainder.
// FIX(review): in the unpadded load path, the "zero the last element" step for
// the THIRD vertex set re-zeroed vecVertA2/B2/C2 (copy-paste from the second
// set), leaving load garbage in the w lanes of vecVertA3/B3/C3. Changed to
// zero vecVertA3/B3/C3, matching what the padded path does for the same set.
// NOTE(review): this listing has lost some original lines (the #else/#endif
// of the DRAWVERT_PADDED load branch, the vecF and v8..v11 splat statements,
// the scalar-loop idVec3 n declaration and closing braces) -- verify against
// the upstream file before building.
6810 void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
6813 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6815 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
// edge vectors: vecD0/D1 for triangle 1 (B-A, C-A), D2/D3 for triangle 2, etc.
6818 vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
6819 vector float vecVertA, vecVertB, vecVertC;
6820 vector float vecVertA2, vecVertB2, vecVertC2;
6821 vector float vecVertA3, vecVertB3, vecVertC3;
6822 vector float vecVertA4, vecVertB4, vecVertC4;
6824 vector float vecN, vecN2, vecN3, vecN4;
6825 vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
// lane rotations used to form the cross product ( yzx * zxy - zxy * yzx )
6826 vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
6827 vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
6829 vector float vecF1, vecF2, vecF3, vecF4;
6830 vector float zeroVector = (vector float)(0);
6831 vector float vecNegOne = (vector float)(-1);
6832 vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
6834 vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
6835 vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
6836 vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
6838 vector unsigned char oneVector = (vector unsigned char)(1);
6839 vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
// keeps elements 0-11 (xyz) of the first operand, takes element 12-15 (w) from the second
6840 vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
6842 const float *xyzPtr = verts[0].xyz.ToFloatPtr();
6843 float *planePtr = planes[0].ToFloatPtr();
// vector loop: 12 indexes = 4 triangles per iteration
6846 for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
6848 #ifndef DRAWVERT_PADDED
6849 // calculate permute vectors to load as needed. these are all
6850 // triangle indexes and are usaully pretty close together but
6851 // not guaranteed to be in any particular order
6852 vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
6853 vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
6854 vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
6855 vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
6856 vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
6857 vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
6858 vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
6859 vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
6860 vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
6861 vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
6862 vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
6863 vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
6866 #ifndef DRAWVERT_PADDED
// unpadded path: two loads + permute per vertex (unaligned access)
6868 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6869 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6870 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6871 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6872 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6873 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6875 vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
6876 vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
6877 vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
6879 // set the last element to 0
6880 vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6881 vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6882 vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6884 // load second A B C
6885 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6886 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6887 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6888 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6889 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6890 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6892 vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
6893 vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
6894 vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
6896 // set the last element to 0
6897 vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6898 vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6899 vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6902 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6903 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6904 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6905 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6906 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6907 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6909 vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
6910 vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
6911 vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
6913 // set the last element to 0
// FIX(review): original re-zeroed vecVertA2/B2/C2 here (copy-paste bug),
// leaving the w lanes of the third vertex set as load garbage.
6914 vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6915 vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6916 vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6918 // load the fourth A B C
6919 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6920 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6921 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6922 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6923 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6924 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6926 vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
6927 vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
6928 vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
6930 // set the last element to 0
6931 vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6932 vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6933 vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
// padded path: one aligned load per vertex
// NOTE(review): the '#else' that separates this branch from the unpadded one
// is missing from this listing -- restore it from upstream.
6936 vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6937 vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6938 vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6940 // set the last element to 0
6941 vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6942 vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6943 vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6945 // load second A B C
6946 vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6947 vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6948 vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6950 // set the last element to 0
6951 vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6952 vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6953 vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6956 vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6957 vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6958 vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6960 // set the last element to 0
6961 vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6962 vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6963 vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6965 // load the fourth A B C
6966 vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6967 vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6968 vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6970 // set the last element to 0
6971 vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6972 vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6973 vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6975 // calculate d0 and d1 for each
6976 vecD0 = vec_sub( vecVertB, vecVertA );
6977 vecD1 = vec_sub( vecVertC, vecVertA );
6979 vecD2 = vec_sub( vecVertB2, vecVertA2 );
6980 vecD3 = vec_sub( vecVertC2, vecVertA2 );
6982 vecD4 = vec_sub( vecVertB3, vecVertA3 );
6983 vecD5 = vec_sub( vecVertC3, vecVertA3 );
6985 vecD6 = vec_sub( vecVertB4, vecVertA4 );
6986 vecD7 = vec_sub( vecVertC4, vecVertA4 );
// cross product n = d1 x d0, computed as rotated-lane products
6988 vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
6989 vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
6990 vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
6991 vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
6992 vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
6993 vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
6994 vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
6995 vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
6997 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
6998 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
6999 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7000 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7002 vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
7003 vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
7004 vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
7005 vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
7006 vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
7007 vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
7008 vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
7009 vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
7011 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7012 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7013 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7014 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
// n = firstHalf - secondHalf (madd with -1 in place of a subtract)
7016 vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7017 vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7018 vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7019 vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
// transpose so vecN/vecN2/vecN3 hold the x/y/z components of all four normals
7022 vector float v0, v1, v2, v3;
7023 v0 = vec_mergeh( vecN, vecN3 );
7024 v1 = vec_mergeh( vecN2, vecN4 );
7025 v2 = vec_mergel( vecN, vecN3 );
7026 v3 = vec_mergel( vecN2, vecN4 );
7028 vecN = vec_mergeh( v0, v1 );
7029 vecN2 = vec_mergel( v0, v1 );
7030 vecN3 = vec_mergeh( v2, v3 );
7031 vecN4 = vec_mergel( v2, v3 );
// squared length of each normal, then reciprocal sqrt to normalize
// NOTE(review): the declaration of vecF is missing from this listing.
7033 vecF = vec_madd( vecN, vecN, zeroVector );
7034 vecF = vec_madd( vecN2, vecN2, vecF );
7035 vecF = vec_madd( vecN3, vecN3, vecF );
7037 vecF = ReciprocalSquareRoot( vecF );
7039 vecF1 = vec_madd( vecF, vecN, zeroVector );
7040 vecF2 = vec_madd( vecF, vecN2, zeroVector );
7041 vecF3 = vec_madd( vecF, vecN3, zeroVector );
7042 vecF4 = vec_madd( vecF, vecN4, zeroVector );
// NOTE(review): the statements that populate v8..v11 (splats of the
// normalized components) are missing from this listing -- restore them.
7044 vector float v8, v9, v10, v11;
7050 // transpose vecVerts
7051 v0 = vec_mergeh( vecVertA, vecVertA3 );
7052 v1 = vec_mergeh( vecVertA2, vecVertA4 );
7053 v2 = vec_mergel( vecVertA, vecVertA3 );
7054 v3 = vec_mergel( vecVertA2, vecVertA4 );
7056 vecVertA = vec_mergeh( v0, v1 );
7057 vecVertA2 = vec_mergel( v0, v1 );
7058 vecVertA3 = vec_mergeh( v2, v3 );
7059 vecVertA4 = vec_mergel( v2, v3 );
// plane distance: -dot( normal, vertex A ) for each of the four triangles
7061 vector float vecTotals;
7062 vecTotals = vec_madd( vecVertA, v8, zeroVector );
7063 vecTotals = vec_madd( vecVertA2, v9, vecTotals );
7064 vecTotals = vec_madd( vecVertA3, v10, vecTotals );
7065 vecTotals = vec_madd( vecVertA4, v11, vecTotals );
7066 vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
// transpose back so each vecFn is one complete plane { x, y, z, d }
7069 v0 = vec_mergeh( vecF1, vecF3 );
7070 v1 = vec_mergeh( vecF2, vecF );
7071 v2 = vec_mergel( vecF1, vecF3 );
7072 v3 = vec_mergel( vecF2, vecF );
7074 vecF1 = vec_mergeh( v0, v1 );
7075 vecF2 = vec_mergel( v0, v1 );
7076 vecF3 = vec_mergeh( v2, v3 );
7077 vecF4 = vec_mergel( v2, v3 );
7080 UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
// scalar cleanup for the remaining 0-3 triangles
// NOTE(review): the idVec3 n declaration and the n *= f statements appear
// to be missing from this listing -- verify against upstream.
7084 for ( ; i < numIndexes; i += 3, j++ ) {
7085 const idDrawVert *a, *b, *c;
7086 float d0[3], d1[3], f;
7089 a = verts + indexes[i + 0];
7090 b = verts + indexes[i + 1];
7091 c = verts + indexes[i + 2];
7093 d0[0] = b->xyz[0] - a->xyz[0];
7094 d0[1] = b->xyz[1] - a->xyz[1];
7095 d0[2] = b->xyz[2] - a->xyz[2];
7097 d1[0] = c->xyz[0] - a->xyz[0];
7098 d1[1] = c->xyz[1] - a->xyz[1];
7099 d1[2] = c->xyz[2] - a->xyz[2];
7101 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7102 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7103 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7105 f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7106 //idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7112 planes[j].SetNormal( n );
7113 planes[j].FitThroughPoint( a->xyz );
7119 idSIMD_AltiVec::DeriveTangents
7121 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7122 For each vertex the normal and tangent vectors are derived from all triangles
7123 using the vertex which results in smooth tangents across the mesh.
7124 In the process the triangle planes are calculated as well.
// idSIMD_AltiVec::DeriveTangents
// Scalar implementation: for each triangle, computes the face plane and the
// texture-space tangent pair (t0, t1), then accumulates the tangents into the
// three vertices so shared vertices get smoothed tangents; the used[] scratch
// array distinguishes "first touch" (assign) from "already touched" (add).
// NOTE(review): this listing has lost several original lines (the a/b/c
// pointer assignments from v0/v1/v2, the n/t0/t1 declarations, the
// normalization multiplies, the used[] condition lines, #else/#endif of the
// PPC_INTRINSICS branch and closing braces) -- verify against upstream.
7128 void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
// per-vertex "tangent already initialized" flags on the stack
7131 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
7132 memset( used, 0, numVerts * sizeof( used[0] ) );
7134 idPlane *planesPtr = planes;
7135 for ( i = 0; i < numIndexes; i += 3 ) {
7136 idDrawVert *a, *b, *c;
7137 // unsigned long signBit;
// d0/d1: edge vectors carrying xyz (0..2) and st texture deltas (3..4)
7138 float d0[5], d1[5], area;
7142 int v0 = indexes[i + 0];
7143 int v1 = indexes[i + 1];
7144 int v2 = indexes[i + 2];
7150 d0[0] = b->xyz[0] - a->xyz[0];
7151 d0[1] = b->xyz[1] - a->xyz[1];
7152 d0[2] = b->xyz[2] - a->xyz[2];
7153 d0[3] = b->st[0] - a->st[0];
7154 d0[4] = b->st[1] - a->st[1];
7156 d1[0] = c->xyz[0] - a->xyz[0];
7157 d1[1] = c->xyz[1] - a->xyz[1];
7158 d1[2] = c->xyz[2] - a->xyz[2];
7159 d1[3] = c->st[0] - a->st[0];
7160 d1[4] = c->st[1] - a->st[1];
// face normal = d1 x d0
7163 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7164 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7165 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7167 f1 = n.x * n.x + n.y * n.y + n.z * n.z;
// signed texture-space area decides the handedness of the tangents
7170 area = d0[3] * d1[4] - d0[4] * d1[3];
7173 t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
7174 t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
7175 t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
7177 f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
7180 t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
7181 t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
7182 t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
7184 f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
// one pipelined call computes all three reciprocal square roots
7186 // Behold! The power of the pipeline
7187 FastScalarInvSqrt_x3( &f1, &f2, &f3 );
// branch-free sign selection when the compiler exposes __fsel
7188 #ifdef PPC_INTRINSICS
7189 f2 = __fsel( area, f2, -f2 );
7190 f3 = __fsel( area, f3, -f3 );
7192 f2 = ( area < 0.0f ) ? -f2 : f2;
7193 f3 = ( area < 0.0f ) ? -f3 : f3;
7203 planesPtr->SetNormal( n );
7204 planesPtr->FitThroughPoint( a->xyz );
// accumulate (or initialize, when first seen) tangents at each vertex
7213 a->tangents[0] += t0;
7214 a->tangents[1] += t1;
7217 a->tangents[0] = t0;
7218 a->tangents[1] = t1;
7224 b->tangents[0] += t0;
7225 b->tangents[1] += t1;
7228 b->tangents[0] = t0;
7229 b->tangents[1] = t1;
7235 c->tangents[0] += t0;
7236 c->tangents[1] += t1;
7239 c->tangents[0] = t0;
7240 c->tangents[1] = t1;
7247 #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
7251 idSIMD_AltiVec::DeriveUnsmoothedTangents
7253 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7254 For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7257 #define DERIVE_UNSMOOTHED_BITANGENT
// AltiVec 4-wide unsmoothed tangent derivation.  Each loop iteration handles
// four vertices: for every vertex A the other two corners (B = dt.v2,
// C = dt.v3) of its dominant triangle are gathered with lvsl/vec_perm
// unaligned loads, the xyz/st edge deltas are formed, and the pre-scaled
// normal / tangent / bitangent are built with shuffled madd/nmsub cross
// products.  Results are merged back into the interleaved idDrawVert stream
// with read-modify-write aligned stores; a scalar loop finishes the
// remaining (numVerts % 4) vertices.  Requires 16-byte aligned idDrawVerts.
7258 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
// layout assumptions: DRAWVERT_OFFSET floats per vertex, 16-byte alignment
7262 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7263 // drawverts aligned
7264 assert( IS_16BYTE_ALIGNED( verts[0] ) );
// gathered xyz (plus leading st) of the A/B/C corners for the four vertices
7266 vector float vecVertA, vecVertB, vecVertC;
7267 vector float vecVertA2, vecVertB2, vecVertC2;
7268 vector float vecVertA3, vecVertB3, vecVertC3;
7269 vector float vecVertA4, vecVertB4, vecVertC4;
7271 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
// splatted dt.normalizationScale[0..2]: tangent / bitangent / normal scales
7272 vector float vecS0, vecS1, vecS2;
7273 vector float vecS0_2, vecS1_2, vecS2_2;
7274 vector float vecS0_3, vecS1_3, vecS2_3;
7275 vector float vecS0_4, vecS1_4, vecS2_4;
// per-vertex deltas: D1/D4/D7/D10 = B-A, D2/D5/D8/D11 = C-A,
// D3/D6/D9/D12 hold the st[1] texcoord deltas (see "remainder" note below)
7277 vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
7278 vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
7279 vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
7280 vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
7281 vector float vecN, vecN2, vecN3, vecN4;
// element rotations for cross products: N0 reorders to (z,x,y,w), N1 to (y,z,x,w)
7283 vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
7284 vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
// T0 splats element 0 across the vector, T1 splats element 2
7285 vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
7286 vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
7287 vector float zeroVector = (vector float)(0);
7289 vector float vecNegOne = (vector float)(-1.0);
// permutes that splice computed results with preserved neighboring bytes
7291 vector float vecStore1, vecStore2, vecStore3;
7292 vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7293 vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7294 vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
7295 vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
7296 vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7297 vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
7298 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7300 vector float vecLd1, vecLd2, vecLd3;
7301 vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
// float views into the vertex stream for the normal and xyz fields
7303 float *normalPtr = verts[0].normal.ToFloatPtr();
7304 float *xyzPtr = verts[0].xyz.ToFloatPtr();
7306 vector float vecFirstHalf, vecSecondHalf;
7307 vector float vecFirstHalf2, vecSecondHalf2;
7308 vector float vecFirstHalf3, vecSecondHalf3;
7309 vector float vecFirstHalf4, vecSecondHalf4;
// main 4-at-a-time loop; i is left pointing at the first unprocessed vertex
7311 for ( i = 0; i+3 < numVerts; i+=4 ) {
7312 int bOffset1, bOffset2, bOffset3, bOffset4;
7313 int cOffset1, cOffset2, cOffset3, cOffset4;
// vertex indices of the two other corners of each dominant triangle
7315 bOffset1 = dominantTris[i].v2;
7316 cOffset1 = dominantTris[i].v3;
7317 bOffset2 = dominantTris[i+1].v2;
7318 cOffset2 = dominantTris[i+1].v3;
7319 bOffset3 = dominantTris[i+2].v2;
7320 cOffset3 = dominantTris[i+2].v3;
7321 bOffset4 = dominantTris[i+3].v2;
7322 cOffset4 = dominantTris[i+3].v3;
// --- vertex group 1: unaligned gather of A, B, C via lvsl + two loads ---
7324 vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
7325 v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
7326 v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
7327 vecVertA = vec_perm( v0, v1, vecPerm0 );
7329 vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
7330 v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7331 v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7332 vecVertB = vec_perm( v2, v3, vecPerm1 );
7334 vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7335 v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7336 v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7337 vecVertC = vec_perm( v4, v5, vecPerm2 );
7339 // put remainder into v2
// collect the st[1] values that did not fit in the first 16 bytes of each
// vertex; they feed the texture-delta vector (vecD3 et al.) below
7340 v1 = vec_perm( v1, v1, vecPerm0 );
7341 v3 = vec_perm( v3, v3, vecPerm1 );
7342 v5 = vec_perm( v5, v5, vecPerm2 );
7344 v1 = vec_mergeh( v1, v5 );
7345 v2 = vec_mergeh( v3, zeroVector );
7346 v2 = vec_mergeh( v1, v2 );
7347 v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// --- vertex group 2, same gather pattern, remainder lands in v3 ---
7350 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7351 v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7352 v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7353 vecVertA2 = vec_perm( v0, v1, vecPerm0 );
7355 vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
7356 v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7357 v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7358 vecVertB2 = vec_perm( v3, v4, vecPerm3 );
7360 vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7361 v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7362 v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7363 vecVertC2 = vec_perm( v5, v6, vecPerm4 );
7365 // put remainder into v3
7366 v1 = vec_perm( v1, v1, vecPerm0 );
7367 v4 = vec_perm( v4, v4, vecPerm3 );
7368 v5 = vec_perm( v6, v6, vecPerm4 );
7370 v1 = vec_mergeh( v1, v5 );
7371 v3 = vec_mergeh( v4, zeroVector );
7372 v3 = vec_mergeh( v1, v3 );
7373 v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// --- vertex group 3, remainder lands in v4 ---
7376 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7377 v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7378 v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7379 vecVertA3 = vec_perm( v0, v1, vecPerm0 );
7381 vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
7382 v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7383 v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7384 vecVertB3 = vec_perm( v4, v5, vecPerm1 );
7386 vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7387 v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7388 v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7389 vecVertC3 = vec_perm( v6, v7, vecPerm2 );
7391 // put remainder into v4
7392 v1 = vec_perm( v1, v1, vecPerm0 );
7393 v5 = vec_perm( v5, v5, vecPerm1 );
7394 v7 = vec_perm( v7, v7, vecPerm2 );
7396 v1 = vec_mergeh( v1, v7 );
7397 v4 = vec_mergeh( v5, zeroVector );
7398 v4 = vec_mergeh( v1, v4 );
7399 v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// --- vertex group 4, remainder lands in v5 ---
7402 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7403 v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7404 v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7405 vecVertA4 = vec_perm( v0, v1, vecPerm0 );
7407 vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
7408 v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7409 v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7410 vecVertB4 = vec_perm( v5, v6, vecPerm3 );
7412 vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7413 v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7414 v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7415 vecVertC4 = vec_perm( v7, v8, vecPerm4 );
7417 // put remainder into v5
7418 v1 = vec_perm( v1, v1, vecPerm0 );
7419 v6 = vec_perm( v6, v6, vecPerm3 );
7420 v8 = vec_perm( v8, v8, vecPerm4 );
7422 v1 = vec_mergeh( v1, v8 );
7423 v5 = vec_mergeh( v6, zeroVector );
7424 v5 = vec_mergeh( v1, v5 );
7425 v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7427 // remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
7429 //vecD1 now holds d0, d1, d2, d3
7430 vecD1 = vec_sub( vecVertB, vecVertA );
7431 vecD4 = vec_sub( vecVertB2, vecVertA2 );
7432 vecD7 = vec_sub( vecVertB3, vecVertA3 );
7433 vecD10 = vec_sub( vecVertB4, vecVertA4 );
7435 // vecD2 how holds d5, d6, d7, d8
7436 vecD2 = vec_sub( vecVertC, vecVertA );
7437 vecD5 = vec_sub( vecVertC2, vecVertA2 );
7438 vecD8 = vec_sub( vecVertC3, vecVertA3 );
7439 vecD11 = vec_sub( vecVertC4, vecVertA4 );
7441 // vecD3 now holds d4, crap, d9, crap
// subtracting the rotated remainder gives b->st[1]-a->st[1] in element 0 and
// c->st[1]-a->st[1] in element 2 (elements 1 and 3 are don't-cares)
7442 vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
7443 vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
7444 vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
7445 vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
7447 // get permute vectors for loading from dt
7448 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
7449 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
7450 vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
7451 vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
7453 // load S values from dominantTris
// two covering loads per 12-byte normalizationScale field (may straddle a
// 16-byte boundary); offset 11 reaches the last byte of the field
7454 v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
7455 v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
7456 v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
7457 v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
7458 v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
7459 v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
7460 v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
7461 v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
7463 v0 = vec_perm( v0, v1, vecPerm1 );
7464 v2 = vec_perm( v2, v3, vecPerm2 );
7465 v4 = vec_perm( v4, v5, vecPerm3 );
7466 v6 = vec_perm( v6, v7, vecPerm4 );
// broadcast each scale factor across a full vector
7468 vecS0 = vec_splat( v0, 0 );
7469 vecS1 = vec_splat( v0, 1 );
7470 vecS2 = vec_splat( v0, 2 );
7472 vecS0_2 = vec_splat( v2, 0);
7473 vecS1_2 = vec_splat( v2, 1 );
7474 vecS2_2 = vec_splat( v2, 2 );
7476 vecS0_3 = vec_splat( v4, 0 );
7477 vecS1_3 = vec_splat( v4, 1 );
7478 vecS2_3 = vec_splat( v4, 2 );
7480 vecS0_4 = vec_splat( v6, 0 );
7481 vecS1_4 = vec_splat( v6, 1 );
7482 vecS2_4 = vec_splat( v6, 2 );
// cross( C-A, B-A ) in two shuffled passes: madd for the positive products
7485 vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
7486 vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
7487 vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
7488 vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
7489 vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
7490 vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
7491 vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
7492 vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
7494 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7495 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7496 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7497 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
// ...then nmsub for the negative products (-(a*b) + firstHalf)
7499 vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
7500 vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
7501 vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
7502 vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
7503 vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
7504 vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
7505 vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
7506 vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
7508 vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7509 vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7510 vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7511 vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7514 // calculate N values
// normal = normalizationScale[2] * cross product
7515 vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
7516 vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
7517 vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
7518 vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
7520 // calculate both halves of the calculation for t
// vecPermT1 splats d9 (the c-edge st[1] delta) across the vector
7522 vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
7524 vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
7526 vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
7528 vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
7530 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7531 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7532 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7533 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
// vecPermT0 splats d4 (the b-edge st[1] delta)
7536 vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
7538 vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
7540 vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
7542 vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
7544 vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7545 vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7546 vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7547 vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7549 // calculate T values
// first tangent = normalizationScale[0] * ( d*d9 - d4*d )
7550 vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
7551 vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
7552 vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
7553 vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
// bitangent derived from the texture derivative -- NOTE(review): this arm is
// compiled out because DERIVE_UNSMOOTHED_BITANGENT is defined above, and
// vecPermT2 has no visible declaration here; confirm before enabling it
7555 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7557 vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
7559 vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
7561 vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
7563 vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
7565 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7566 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7567 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7568 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7570 vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
7572 vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
7574 vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
7576 vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
7579 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7580 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7581 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7582 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
// active arm: bitangent = cross( normal, tangent ), again via two passes
7585 vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
7586 vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
7587 vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
7588 vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
7589 vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
7590 vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
7591 vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
7592 vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
7594 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7595 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7596 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7597 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7599 vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
7600 vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
7601 vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
7602 vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
7603 vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
7604 vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
7605 vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
7606 vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
7608 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7609 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7610 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7611 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7613 // finish the calculation
// firstHalf - secondHalf, expressed as (-1 * secondHalf) + firstHalf
7614 vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7615 vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7616 vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7617 vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
// second tangent (bitangent) = normalizationScale[1] * result
7619 vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
7620 vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
7621 vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
7622 vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
// --- store phase: each vertex's normal + 2 tangents (9 floats) are spliced
// into three aligned 16-byte stores, preserving the surrounding fields ---
7626 // read values that we need to preserve
7627 vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
7628 vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
7630 //generate vectors to store
7631 vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
7632 vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
7633 vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
7635 // store out results
7636 ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7638 // read values that we need to preserve
7639 vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
7641 // generate vectors to store
7642 vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
7643 vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
7644 vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
7646 // instead of doing permute, shift it where it needs to be and use vec_ste
7647 // store out vectors
7648 ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7650 // read values that we need to preserve
7651 vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
7653 // generate vectors to store
7654 vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
7655 vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
7656 vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
7658 // store out vectors
7659 ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7661 // read values that we need to preserve
7662 vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7663 vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7665 // generate vectors to store
7666 vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
7667 vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
7668 vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
7670 // store out vectors
7671 ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
// scalar cleanup for the final 0-3 vertices; same math as the vector loop
7675 for ( ; i < numVerts; i++ ) {
7676 idDrawVert *a, *b, *c;
7677 float d0, d1, d2, d3, d4;
7678 float d5, d6, d7, d8, d9;
7684 const dominantTri_s &dt = dominantTris[i];
// edge a->b: xyz deltas d0..d2, texcoord deltas d3..d4
7690 d0 = b->xyz[0] - a->xyz[0];
7691 d1 = b->xyz[1] - a->xyz[1];
7692 d2 = b->xyz[2] - a->xyz[2];
7693 d3 = b->st[0] - a->st[0];
7695 d4 = b->st[1] - a->st[1];
// edge a->c: xyz deltas d5..d7, texcoord deltas d8..d9
7697 d5 = c->xyz[0] - a->xyz[0];
7698 d6 = c->xyz[1] - a->xyz[1];
7699 d7 = c->xyz[2] - a->xyz[2];
7700 d8 = c->st[0] - a->st[0];
7702 d9 = c->st[1] - a->st[1];
7704 s0 = dt.normalizationScale[0];
7705 s1 = dt.normalizationScale[1];
7706 s2 = dt.normalizationScale[2];
// normal = s2 * cross( a->c, a->b )
7708 n0 = s2 * ( d6 * d2 - d7 * d1 );
7709 n1 = s2 * ( d7 * d0 - d5 * d2 );
7710 n2 = s2 * ( d5 * d1 - d6 * d0 );
// first tangent (s texture axis), scaled by s0
7712 t0 = s0 * ( d0 * d9 - d4 * d5 );
7713 t1 = s0 * ( d1 * d9 - d4 * d6 );
7714 t2 = s0 * ( d2 * d9 - d4 * d7 );
7716 #ifndef DERIVE_UNSMOOTHED_BITANGENT
// bitangent from the texture-space derivative directly
7717 t3 = s1 * ( d3 * d5 - d0 * d8 );
7718 t4 = s1 * ( d3 * d6 - d1 * d8 );
7719 t5 = s1 * ( d3 * d7 - d2 * d8 );
// active arm: bitangent = s1 * cross( normal, tangent )
7721 t3 = s1 * ( n2 * t1 - n1 * t2 );
7722 t4 = s1 * ( n0 * t2 - n2 * t0 );
7723 t5 = s1 * ( n1 * t0 - n0 * t1 );
// write the two tangents into the vertex
7730 a->tangents[0][0] = t0;
7731 a->tangents[0][1] = t1;
7732 a->tangents[0][2] = t2;
7734 a->tangents[1][0] = t3;
7735 a->tangents[1][1] = t4;
7736 a->tangents[1][2] = t5;
7743 idSIMD_AltiVec::DeriveUnsmoothedTangents
7745 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7746 For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7749 #define DERIVE_UNSMOOTHED_BITANGENT
// Scalar fallback (compiled when DERIVE_UNSMOOTH_DRAWVERT_ALIGNED is not
// defined): derives each vertex's normal and both tangents from its single
// dominant triangle (dt.v2 / dt.v3 index the other two corners).  The
// dt.normalizationScale[0..2] factors pre-scale the tangent, bitangent and
// normal, so no per-vertex normalization is performed here.
7751 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7754 for ( i = 0; i < numVerts; i++ ) {
7755 idDrawVert *a, *b, *c;
7756 float d0, d1, d2, d3, d4;
7757 float d5, d6, d7, d8, d9;
7763 const dominantTri_s &dt = dominantTris[i];
// edge a->b: xyz deltas d0..d2, texcoord deltas d3..d4
7769 d0 = b->xyz[0] - a->xyz[0];
7770 d1 = b->xyz[1] - a->xyz[1];
7771 d2 = b->xyz[2] - a->xyz[2];
7772 d3 = b->st[0] - a->st[0];
7774 d4 = b->st[1] - a->st[1];
// edge a->c: xyz deltas d5..d7, texcoord deltas d8..d9
7776 d5 = c->xyz[0] - a->xyz[0];
7777 d6 = c->xyz[1] - a->xyz[1];
7778 d7 = c->xyz[2] - a->xyz[2];
7779 d8 = c->st[0] - a->st[0];
7781 d9 = c->st[1] - a->st[1];
// precomputed factors: [0] tangent, [1] bitangent, [2] normal
7783 s0 = dt.normalizationScale[0];
7784 s1 = dt.normalizationScale[1];
7785 s2 = dt.normalizationScale[2];
// normal = s2 * cross( a->c, a->b )
7787 n0 = s2 * ( d6 * d2 - d7 * d1 );
7788 n1 = s2 * ( d7 * d0 - d5 * d2 );
7789 n2 = s2 * ( d5 * d1 - d6 * d0 );
// first tangent (s texture axis), scaled by s0
7791 t0 = s0 * ( d0 * d9 - d4 * d5 );
7792 t1 = s0 * ( d1 * d9 - d4 * d6 );
7793 t2 = s0 * ( d2 * d9 - d4 * d7 );
7795 #ifndef DERIVE_UNSMOOTHED_BITANGENT
// bitangent from the texture-space derivative directly
7796 t3 = s1 * ( d3 * d5 - d0 * d8 );
7797 t4 = s1 * ( d3 * d6 - d1 * d8 );
7798 t5 = s1 * ( d3 * d7 - d2 * d8 );
// active arm: bitangent = s1 * cross( normal, tangent ), guaranteeing
// orthogonality to both (DERIVE_UNSMOOTHED_BITANGENT is defined above)
7800 t3 = s1 * ( n2 * t1 - n1 * t2 );
7801 t4 = s1 * ( n0 * t2 - n2 * t0 );
7802 t5 = s1 * ( n1 * t0 - n0 * t1 );
// write the two tangents into the vertex
7809 a->tangents[0][0] = t0;
7810 a->tangents[0][1] = t1;
7811 a->tangents[0][2] = t2;
7813 a->tangents[1][0] = t3;
7814 a->tangents[1][1] = t4;
7815 a->tangents[1][2] = t5;
7819 #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
7823 idSIMD_AltiVec::NormalizeTangents
7825 Normalizes each vertex normal and projects and normalizes the
7826 tangent vectors onto the plane orthogonal to the vertex normal.
7829 void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
7832 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7834 float *addr = verts[0].normal.ToFloatPtr();
7835 float *tAddr = verts[0].tangents[0].ToFloatPtr();
7837 // v0 through v3 maintain originally loaded values so we don't take
7838 // as much hit for unaligned stores
7839 vector float v0, v1, v2, v3;
7840 // v5 through v8 are the "working" values of the vectors
7841 vector float v5, v6, v7, v8;
7843 vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
7844 vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
7845 vector float vecF, vecF2;
7846 vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
7848 register vector float zeroVector = (vector float)(0.0);
7850 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7851 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7852 vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
7853 vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
7854 vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
7856 vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
7857 vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
7859 vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
7860 vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
7861 vector unsigned char storeT41, storeT42;
7865 if ( i+3 < numVerts ) {
7866 // for loading normal from idDrawVert
7867 vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
7868 vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7869 vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7870 vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7872 // for loading tangents from idDrawVert
7873 vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7874 vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7875 vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7876 vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7877 vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7878 vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7879 vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7880 vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7882 // generate permute vectors to store normals
7883 storePerm0 = vec_lvsr( 0, addr );
7884 storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
7885 storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
7886 storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
7888 // generate permute vectors to store tangents
7889 storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7890 storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7892 storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7893 storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7895 storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7896 storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7898 storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7899 storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7902 for ( ; i+3 < numVerts; i+=4 ) {
7905 vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
7906 vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
7907 v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
7909 vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
7910 vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
7911 v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
7913 vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7914 vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7915 v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
7917 vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
7918 vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
7919 v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
7921 // zero out the last element of each useless vector
7922 v0 = vec_perm( v0, zeroVector, vecPermLast );
7923 v1 = vec_perm( v1, zeroVector, vecPermLast );
7924 v2 = vec_perm( v2, zeroVector, vecPermLast );
7925 v3 = vec_perm( v3, zeroVector, vecPermLast );
7927 // got 4 vectors in v0 through v3, sum them each accross
7928 // and put into one vector
7929 vecTemp = vec_madd( v0, v0, zeroVector );
7931 vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
7932 vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
7933 // element 0 of vecSum now has sum of v0
7935 vecTemp2 = vec_madd( v1, v1, zeroVector );
7936 tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
7937 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7938 // put this into vecSum
7939 vecSum = vec_mergeh( vecSum, tempSum );
7941 vecTemp3 = vec_madd( v2, v2, zeroVector );
7942 tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
7943 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7944 // put this into vecSum
7945 vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
7947 vecTemp4 = vec_madd( v3, v3, zeroVector );
7948 tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
7949 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7950 // put this into vecSum
7951 vecSum = vec_perm( vecSum, tempSum, vecPermLast );
7953 // take reciprocal square roots of these
7954 vecF = ReciprocalSquareRoot( vecSum );
7956 // multiply each vector by f
7957 v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
7958 v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
7959 v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
7960 v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
7962 // load tangents as unaligned
7963 vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
7964 vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
7965 vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
7967 vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7968 vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7969 vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7971 vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7972 vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7973 vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7975 vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7976 vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7977 vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7979 vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
7980 vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
7981 vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
7982 vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
7983 vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
7984 vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
7985 vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
7986 vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
7988 //zero out last element of tangents
7989 vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
7990 vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
7991 vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
7992 vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
7993 vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
7994 vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
7995 vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
7996 vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
7999 tempSum = zeroVector;
8000 tempSum = vec_madd( vec1T0, v5, tempSum );
8001 //sum accross tempSum
8002 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8003 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8004 // put tempSum splatted accross vecTSum1
8005 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8006 vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8008 //vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8009 vec1T0 = vec_sub( vec1T0, vecTSum1 );
8011 tempSum = zeroVector;
8012 tempSum = vec_madd( vec2T0, v6, tempSum );
8014 //sum accross tempSum
8015 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8016 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8017 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8018 vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8019 vec2T0 = vec_sub( vec2T0, vecTSum1 );
8021 tempSum = zeroVector;
8022 tempSum = vec_madd( vec3T0, v7, tempSum );
8024 //sum accross tempSum
8025 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8026 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8027 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8028 vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8029 vec3T0 = vec_sub( vec3T0, vecTSum1 );
8031 tempSum = zeroVector;
8032 tempSum = vec_madd( vec4T0, v8, tempSum );
8034 //sum accross tempSum
8035 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8036 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8037 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8038 vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8039 vec4T0 = vec_sub( vec4T0, vecTSum1 );
8042 tempSum = zeroVector;
8043 tempSum = vec_madd( vec1T1, v5, tempSum );
8045 //sum accross tempSum
8046 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8047 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8048 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8049 vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8051 //vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8052 vec1T1 = vec_sub( vec1T1, vecTSum1 );
8054 tempSum = zeroVector;
8055 tempSum = vec_madd( vec2T1, v6, tempSum );
8057 //sum accross tempSum
8058 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8059 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8060 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8061 vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8062 vec2T1 = vec_sub( vec2T1, vecTSum1 );
8064 tempSum = zeroVector;
8065 tempSum = vec_madd( vec3T1, v7, tempSum );
8067 //sum accross tempSum
8068 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8069 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8070 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8071 vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8072 vec3T1 = vec_sub( vec3T1, vecTSum1 );
8074 tempSum = zeroVector;
8075 tempSum = vec_madd( vec4T1, v8, tempSum );
8077 //sum accross tempSum
8078 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8079 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8080 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8081 vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8082 vec4T1 = vec_sub( vec4T1, vecTSum1 );
8085 // sum accross vectors and put into one vector
8086 vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
8087 vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8088 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8090 // element 0 of vecSum now has sum of v0
8091 vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
8092 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8093 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8094 // put this into vecSum
8095 vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
8096 vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
8097 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8098 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8099 // put this into vecSum
8100 vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
8101 vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
8102 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8103 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8104 // put this into vecSum
8105 vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
8107 vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
8108 vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8109 vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
8110 // element 0 of vecSum now has sum of v0
8111 vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
8112 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8113 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8114 // put this into vecSum
8115 vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
8116 vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
8117 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8118 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8119 // put this into vecSum
8120 vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
8121 vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
8122 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8123 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8124 // put this into vecSum
8125 vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
8128 vecF = ReciprocalSquareRoot( vecTSum1 );
8130 vecF2 = ReciprocalSquareRoot( vecTSum2 );
8132 // multiply each tangent vector by f
8134 vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
8135 vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
8136 vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
8137 vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
8139 vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
8140 vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
8141 vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
8142 vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
8144 // rotate input data
8145 v5 = vec_perm( v5, v5, storePerm0 );
8146 v6 = vec_perm( v6, v6, storePerm1 );
8147 v7 = vec_perm( v7, v7, storePerm2 );
8148 v8 = vec_perm( v8, v8, storePerm3 );
8150 vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8151 vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8152 vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8154 vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8155 vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8156 vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8158 vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8159 vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8160 vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8162 vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8163 vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8164 vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8166 // store tangents[0] and tangents[1]
8167 vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
8168 vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
8170 vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8171 vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8172 vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8173 vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8174 vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8175 vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8177 // store second tangents[0] and tangents[1]
8178 vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
8179 vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
8181 vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8182 vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8183 vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8184 vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8185 vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8186 vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8188 // store third tangents[0] and tangents[1]
8189 vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
8190 vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
8192 vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8193 vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8194 vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8195 vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8196 vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8197 vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8199 // store fourth tangents[0] and tangents[1]
8200 vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
8201 vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
8203 vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8204 vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8205 vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8206 vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8207 vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8208 vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8212 for ( ; i < numVerts; i++ ) {
8213 idVec3 &v = verts[i].normal;
8216 //f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8217 f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8218 v.x *= f; v.y *= f; v.z *= f;
8220 for ( int j = 0; j < 2; j++ ) {
8221 idVec3 &t = verts[i].tangents[j];
8224 // f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8225 f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8226 t.x *= f; t.y *= f; t.z *= f;
8230 #endif /* ENABLE_DERIVE */
8232 #ifdef ENABLE_CREATE
8236 idSIMD_AltiVec::CreateTextureSpaceLightVectors
8238 Calculates light vectors in texture space for the given triangle vertices.
8239 For each vertex the direction towards the light origin is projected onto texture space.
8240 The light vectors are only calculated for the vertices referenced by the indexes.
8244 void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
// Build a table marking which vertices are referenced by the index list.
8246 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8247 memset( used, 0, numVerts * sizeof( used[0] ) );
// Marking loop, unrolled by 8 indexes per iteration.
8250 for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8251 used[indexes[i]] = true;
8252 used[indexes[i+1]] = true;
8253 used[indexes[i+2]] = true;
8254 used[indexes[i+3]] = true;
8255 used[indexes[i+4]] = true;
8256 used[indexes[i+5]] = true;
8257 used[indexes[i+6]] = true;
8258 used[indexes[i+7]] = true;
// Cleanup loop for the remaining (numIndexes % 8) indexes.
8261 for ( ; i < numIndexes; i++ ) {
8262 used[indexes[i]] = true;
// Main loop: two vertices per iteration. For each vertex, form the
// direction from the vertex to the light and project it onto the
// vertex's texture-space basis (tangents[0], tangents[1], normal).
// NOTE(review): used[] is constructed above but never read in this
// visible body — light vectors appear to be computed for every vertex
// unconditionally; confirm this matches the intended contract.
8265 for ( i = 0; i+1 < numVerts; i+=2 ) {
8267 const idDrawVert *v = &verts[i];
8268 const idDrawVert *v2 = &verts[i+1];
8272 idVec3 lightDir, lightDir2;
// Vertex-to-light direction for the first vertex of the pair.
8274 lightDir[0] = lightOrigin[0] - v->xyz[0];
8275 lightDir[1] = lightOrigin[1] - v->xyz[1];
8276 lightDir[2] = lightOrigin[2] - v->xyz[2];
// Vertex-to-light direction for the second vertex of the pair.
8278 lightDir2[0] = lightOrigin[0] - v2->xyz[0];
8279 lightDir2[1] = lightOrigin[1] - v2->xyz[1];
8280 lightDir2[2] = lightOrigin[2] - v2->xyz[2];
// Dot products against the texture-space basis vectors of vertex i.
8282 x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8283 y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8284 z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
// Same projection for vertex i+1.
8286 x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
8287 y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
8288 z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
8291 lightVectors[i][0] = x;
8292 lightVectors[i][1] = y;
8293 lightVectors[i][2] = z;
8297 lightVectors[i+1][0] = x2;
8298 lightVectors[i+1][1] = y2;
8299 lightVectors[i+1][2] = z2;
// Cleanup loop: handles the final vertex when numVerts is odd.
8304 for ( ; i < numVerts; i++ ) {
8309 const idDrawVert *v = &verts[i];
8312 lightDir[0] = lightOrigin[0] - v->xyz[0];
8313 lightDir[1] = lightOrigin[1] - v->xyz[1];
8314 lightDir[2] = lightOrigin[2] - v->xyz[2];
8316 lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8317 lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8318 lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8325 idSIMD_AltiVec::CreateSpecularTextureCoords
8327 Calculates specular texture coordinates for the given triangle vertices.
8328 For each vertex the normalized direction towards the light origin is added to the
8329 normalized direction towards the view origin and the result is projected onto texture space.
8330 The texture coordinates are only calculated for the vertices referenced by the indexes.
8333 void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
// Build a table marking which vertices are referenced by the index list;
// only those vertices have their texture coordinates written out below.
8335 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8336 memset( used, 0, numVerts * sizeof( used[0] ) );
// Marking loop, unrolled by 8 indexes per iteration.
8339 for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8340 used[indexes[i]] = true;
8341 used[indexes[i+1]] = true;
8342 used[indexes[i+2]] = true;
8343 used[indexes[i+3]] = true;
8344 used[indexes[i+4]] = true;
8345 used[indexes[i+5]] = true;
8346 used[indexes[i+6]] = true;
8347 used[indexes[i+7]] = true;
// Cleanup loop for the remaining indexes.
8350 for ( ; i < numIndexes; i++ ) {
8351 used[indexes[i]] = true;
8354 // load lightOrigin and viewOrigin into vectors
// Standard AltiVec unaligned-load idiom: load the two quadwords covering
// the (possibly misaligned) 3-float origin and vec_perm them together.
8355 const float *lightOriginPtr = lightOrigin.ToFloatPtr();
8356 const float *viewOriginPtr = viewOrigin.ToFloatPtr();
8357 vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
8358 vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
8359 vector float v0 = vec_ld( 0, lightOriginPtr );
8360 vector float v1 = vec_ld( 15, lightOriginPtr );
8361 vector float v2 = vec_ld( 0, viewOriginPtr );
8362 vector float v3 = vec_ld( 15, viewOriginPtr );
8363 vector float vecLightOrigin = vec_perm( v0, v1, permVec );
8364 vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
8365 const vector float zeroVector = (vector float)(0);
// Main loop: two idDrawVerts per iteration, fully vectorized.
8368 for ( index = 0; index+1 < numVerts; index+=2 ) {
8369 const float *vertPtr = verts[index].xyz.ToFloatPtr();
8370 const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
// Permute vectors for realigning the unaligned idDrawVert fields.
8372 permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8373 permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
// Load enough quadwords to span xyz, normal, tangents[0], tangents[1].
8375 v0 = vec_ld( 0, vertPtr );
8376 v1 = vec_ld( 15, vertPtr );
8377 vector float v2 = vec_ld( 31, vertPtr );
8378 vector float v3 = vec_ld( 47, vertPtr );
8379 vector float v4 = vec_ld( 63, vertPtr );
8381 vector float v5 = vec_ld( 0, vertPtr2 );
8382 vector float v6 = vec_ld( 15, vertPtr2 );
8383 vector float v7 = vec_ld( 31, vertPtr2 );
8384 vector float v8 = vec_ld( 47, vertPtr2 );
8385 vector float v9 = vec_ld( 63, vertPtr2 );
8387 // figure out what values go where
// Extract xyz / normal / tangents for the first vertex of the pair.
8388 vector float vecXYZ = vec_perm( v0, v1, permVec );
8389 vector float vecNormal = vec_perm( v1, v2, permVec );
8390 vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8391 const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8392 permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8393 const vector float vecTangent1 = vec_perm( v3, v4, permVec );
// Same field extraction for the second vertex of the pair.
8395 vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
8396 vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
8397 vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
8398 const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
8399 permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
8400 const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
8402 // calculate lightDir
8403 vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8404 vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8406 vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
8407 vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
8409 // calculate distance
// Squared component products; summed across lanes below.
8410 vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8411 vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8413 vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
8414 vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
8416 // sum accross first 3 elements of vector
8417 vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8418 vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8419 vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8420 vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8422 vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
8423 vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
8424 vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
8425 vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
8427 // splat sum accross the whole vector
8428 vecTempLight = vec_splat( vecTempLight, 0 );
8429 vecTempView = vec_splat( vecTempView, 0 );
8431 vecTempLight2 = vec_splat( vecTempLight2, 0 );
8432 vecTempView2 = vec_splat( vecTempView2, 0 );
// 1/sqrt of the squared lengths -> inverse lengths for normalization.
8434 vecTempLight = ReciprocalSquareRoot( vecTempLight );
8435 vecTempView = ReciprocalSquareRoot( vecTempView );
8437 vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
8438 vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
8440 // modify light and view vectors based on ilength
// lightDir becomes the half-angle-style sum: normalized(light) + normalized(view).
8441 vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8442 vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8444 vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
8445 vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
8447 // calculate what to store in each texture coord
// Per-lane products; each dot product is completed by the lane sums below.
8448 vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8449 vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8450 vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8452 vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
8453 vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
8454 vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
8456 // sum accross first 3 elements of vector
// Note the second addend shifts the ORIGINAL product vector by 8, so lane 0
// ends up with exactly elements 0+1+2 (the w lane is excluded from the dot).
8457 vector float tempSum3;
8458 tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8459 vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8460 tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8461 vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8462 tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8463 vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8465 tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
8466 vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
8467 tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
8468 vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
8469 vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
8470 vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
// Splat the completed dot products so lane 0 can be stored with vec_ste.
8472 vecTC0 = vec_splat( vecTC0, 0 );
8473 vecTC1 = vec_splat( vecTC1, 0 );
8474 vecTC2 = vec_splat( vecTC2, 0 );
8476 vecTC3 = vec_splat( vecTC3, 0 );
8477 vecTC4 = vec_splat( vecTC4, 0 );
8478 vecTC5 = vec_splat( vecTC5, 0 );
// Only write results for vertices referenced by the index list.
8480 if ( used[index] ) {
8481 // store out results
8482 vec_ste( vecTC0, 0, &texCoords[index][0] );
8483 vec_ste( vecTC1, 0, &texCoords[index][1] );
8484 vec_ste( vecTC2, 0, &texCoords[index][2] );
8485 vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8488 if ( used[index+1] ) {
8489 vec_ste( vecTC3, 0, &texCoords[index+1][0] );
8490 vec_ste( vecTC4, 0, &texCoords[index+1][1] );
8491 vec_ste( vecTC5, 0, &texCoords[index+1][2] );
8492 vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
// Cleanup loop: same computation for the final vertex when numVerts is odd.
8497 for ( ; index < numVerts; index++ ) {
8498 if ( !used[index] ) {
8502 const float *vertPtr = verts[index].xyz.ToFloatPtr();
8504 permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8506 v0 = vec_ld( 0, vertPtr );
8507 v1 = vec_ld( 15, vertPtr );
8508 vector float v2 = vec_ld( 31, vertPtr );
8509 vector float v3 = vec_ld( 47, vertPtr );
8510 vector float v4 = vec_ld( 63, vertPtr );
8512 // figure out what values go where
8513 vector float vecXYZ = vec_perm( v0, v1, permVec );
8514 vector float vecNormal = vec_perm( v1, v2, permVec );
8515 vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8516 const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8517 permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8518 const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8520 // calculate lightDir
8521 vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8522 vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8524 // calculate distance
8525 vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8526 vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8528 // sum accross first 3 elements of vector
8529 vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8530 vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8531 vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8532 vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8534 // splat sum accross the whole vector
8535 vecTempLight = vec_splat( vecTempLight, 0 );
8536 vecTempView = vec_splat( vecTempView, 0 );
8538 vecTempLight = ReciprocalSquareRoot( vecTempLight );
8539 vecTempView = ReciprocalSquareRoot( vecTempView );
8541 // modify light and view vectors based on ilength
8542 vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8543 vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8545 // calculate what to store in each texture coord
8546 vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8547 vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8548 vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8550 // sum accross first 3 elements of vector
8551 vector float tempSum3;
8552 tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8553 vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8554 tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8555 vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8556 tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8557 vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8559 vecTC0 = vec_splat( vecTC0, 0 );
8560 vecTC1 = vec_splat( vecTC1, 0 );
8561 vecTC2 = vec_splat( vecTC2, 0 );
8563 // store out results
8564 vec_ste( vecTC0, 0, &texCoords[index][0] );
8565 vec_ste( vecTC1, 0, &texCoords[index][1] );
8566 vec_ste( vecTC2, 0, &texCoords[index][2] );
8567 vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8571 #endif /* 0 for disable spec coord */
8575 #ifdef VERTEXCACHE_ALIGNED
8578 idSIMD_AltiVec::CreateShadowCache
8581 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
// Aligned-vertexCache variant: each unmapped vertex emits two idVec4s —
// (xyz, 1) for the near cap and (xyz - lightOrigin, 0) for the projected
// far cap — and records its cache slot in vertRemap.
8585 assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8587 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8588 register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8589 register vector float zeroVector = (vector float)(0.0);
8590 register vector float oneVector = (vector float)(1);
// Keeps bytes 0-11 of the first operand and splices in element 0 of the
// second operand as the w component.
8591 register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8593 const float *lPtr = lightOrigin.ToFloatPtr();
8599 // put values into a vector
8600 vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8601 v0 = vec_ld( 0, lPtr );
8602 v1 = vec_ld( 15, lPtr );
8603 v0 = vec_perm( v0, v1, vecPerm );
8604 v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8606 //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
// Main loop: four vertices per iteration, each handled independently
// because vertRemap gates whether a vertex has already been cached.
8607 for ( ; i+3 < numVerts; i+= 4 ) {
8608 if ( ! vertRemap[i] ) {
8609 vPtr = verts[i].xyz.ToFloatPtr();
8611 #ifndef DRAWVERT_PADDED
// Unaligned load of the 3-float xyz via the lvsl/perm idiom.
8612 vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8613 v2 = vec_ld( 0, vPtr );
8614 v3 = vec_ld( 15, vPtr );
8615 v7 = vec_perm( v2, v3, vecPerm2 );
8617 v7 = vec_ld( 0, vPtr );
8619 v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8620 v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8621 v1 = vec_sub( v2, v0 );
8623 vec_st( v3, 0, &vertexCache[outVerts][0] );
8624 vec_st( v1, 0, &vertexCache[outVerts+1][0] );
8626 vertRemap[i] = outVerts;
8630 if ( ! vertRemap[i+1] ) {
8631 vPtr2 = verts[i+1].xyz.ToFloatPtr();
8633 #ifndef DRAWVERT_PADDED
8634 vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8635 v4 = vec_ld( 0, vPtr2 );
8636 v5 = vec_ld( 15, vPtr2 );
8637 v6 = vec_perm( v4, v5, vecPerm3 );
8639 v6 = vec_ld( 0, vPtr2 );
8641 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8642 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8643 v6 = vec_sub( v4, v0 );
8645 vec_st( v5, 0, &vertexCache[outVerts][0] );
8646 vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8648 vertRemap[i+1] = outVerts;
8652 if ( ! vertRemap[i+2] ) {
8653 vPtr3 = verts[i+2].xyz.ToFloatPtr();
8655 #ifndef DRAWVERT_PADDED
8656 vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8657 v1 = vec_ld( 0, vPtr3 );
8658 v2 = vec_ld( 15, vPtr3 );
8659 v3 = vec_perm( v1, v2, vecPerm4 );
8661 v3 = vec_ld( 0, vPtr3 );
8663 v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8664 v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8665 v3 = vec_sub( v1, v0 );
8667 vec_st( v2, 0, &vertexCache[outVerts][0] );
8668 vec_st( v3, 0, &vertexCache[outVerts+1][0] );
8670 vertRemap[i+2] = outVerts;
8674 if ( ! vertRemap[i+3] ) {
8675 vPtr4 = verts[i+3].xyz.ToFloatPtr();
8676 #ifndef DRAWVERT_PADDED
8677 vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8678 v4 = vec_ld( 0, vPtr4 );
// NOTE(review): offset 16 differs from the 15 used in the other three
// unrolled cases; the unaligned-load idiom normally pairs vec_ld(0,p)
// with vec_ld(15,p) — confirm this is intentional.
8679 v5 = vec_ld( 16, vPtr4 );
8680 v6 = vec_perm( v4, v5, vecPerm5 );
8682 v6 = vec_ld( 0, vPtr4 );
8684 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8685 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8686 v6 = vec_sub( v4, v0 );
8688 vec_st( v5, 0, &vertexCache[outVerts][0] );
8689 vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8691 vertRemap[i+3] = outVerts;
// Scalar cleanup loop for the remaining (numVerts % 4) vertices.
8697 for (; i < numVerts; i++ ) {
8698 if ( vertRemap[i] ) {
8701 const float *v = verts[i].xyz.ToFloatPtr();
8702 vertexCache[outVerts+0][0] = v[0];
8703 vertexCache[outVerts+0][1] = v[1];
8704 vertexCache[outVerts+0][2] = v[2];
8705 vertexCache[outVerts+0][3] = 1.0f;
8707 // R_SetupProjection() builds the projection matrix with a slight crunch
8708 // for depth, which keeps this w=0 division from rasterizing right at the
8709 // wrap around point and causing depth fighting with the rear caps
8710 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8711 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8712 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8713 vertexCache[outVerts+1][3] = 0.0f;
8714 vertRemap[i] = outVerts;
8724 idSIMD_AltiVec::CreateShadowCache
8727 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
// Unaligned-vertexCache variant of CreateShadowCache: identical logic to
// the aligned version above, but results go out through UNALIGNED_STORE2
// instead of aligned vec_st pairs.
8731 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8732 register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8733 register vector float zeroVector = (vector float)(0.0);
8734 register vector float oneVector = (vector float)(1);
// Keeps bytes 0-11 of the first operand and splices in element 0 of the
// second operand as the w component.
8735 register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8737 const float *lPtr = lightOrigin.ToFloatPtr();
8743 // put values into a vector
8744 vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8745 v0 = vec_ld( 0, lPtr );
8746 v1 = vec_ld( 15, lPtr );
8747 v0 = vec_perm( v0, v1, vecPerm );
8748 v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8750 //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
// Main loop: four vertices per iteration, gated per-vertex by vertRemap.
8751 for ( ; i+3 < numVerts; i+= 4 ) {
8752 if ( ! vertRemap[i] ) {
8753 vPtr = verts[i].xyz.ToFloatPtr();
8754 #ifndef DRAWVERT_PADDED
8755 vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8756 v2 = vec_ld( 0, vPtr );
8757 v3 = vec_ld( 15, vPtr );
8758 v7 = vec_perm( v2, v3, vecPerm2 );
8760 v7 = vec_ld( 0, vPtr );
8762 v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8763 v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8764 v1 = vec_sub( v2, v0 );
8767 UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
8769 vertRemap[i] = outVerts;
8773 if ( ! vertRemap[i+1] ) {
8774 vPtr2 = verts[i+1].xyz.ToFloatPtr();
8775 #ifndef DRAWVERT_PADDED
8776 vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8777 v4 = vec_ld( 0, vPtr2 );
8778 v5 = vec_ld( 15, vPtr2 );
8779 v6 = vec_perm( v4, v5, vecPerm3 );
8781 v6 = vec_ld( 0, vPtr2 );
8783 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8784 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8785 v6 = vec_sub( v4, v0 );
8788 UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8790 vertRemap[i+1] = outVerts;
8794 if ( ! vertRemap[i+2] ) {
8795 vPtr3 = verts[i+2].xyz.ToFloatPtr();
8796 #ifndef DRAWVERT_PADDED
8797 vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8798 v1 = vec_ld( 0, vPtr3 );
8799 v2 = vec_ld( 15, vPtr3 );
8800 v3 = vec_perm( v1, v2, vecPerm4 );
8802 v3 = vec_ld( 0, vPtr3 );
8804 v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8805 v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8806 v3 = vec_sub( v1, v0 );
8809 UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
8811 vertRemap[i+2] = outVerts;
8814 if ( ! vertRemap[i+3] ) {
8815 vPtr4 = verts[i+3].xyz.ToFloatPtr();
8816 #ifndef DRAWVERT_PADDED
8817 vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8818 v4 = vec_ld( 0, vPtr4 );
// NOTE(review): offset 16 differs from the 15 used in the other three
// unrolled cases; the unaligned-load idiom normally pairs vec_ld(0,p)
// with vec_ld(15,p) — confirm this is intentional (same anomaly exists
// in the aligned variant of this function).
8819 v5 = vec_ld( 16, vPtr4 );
8820 v6 = vec_perm( v4, v5, vecPerm5 );
8822 v6 = vec_ld( 0, vPtr4 );
8825 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8826 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8827 v6 = vec_sub( v4, v0 );
8830 UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8833 vertRemap[i+3] = outVerts;
// Scalar cleanup loop for the remaining (numVerts % 4) vertices.
8839 for (; i < numVerts; i++ ) {
8840 if ( vertRemap[i] ) {
8843 const float *v = verts[i].xyz.ToFloatPtr();
8844 vertexCache[outVerts+0][0] = v[0];
8845 vertexCache[outVerts+0][1] = v[1];
8846 vertexCache[outVerts+0][2] = v[2];
8847 vertexCache[outVerts+0][3] = 1.0f;
8849 // R_SetupProjection() builds the projection matrix with a slight crunch
8850 // for depth, which keeps this w=0 division from rasterizing right at the
8851 // wrap around point and causing depth fighting with the rear caps
8852 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8853 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8854 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8855 vertexCache[outVerts+1][3] = 0.0f;
8856 vertRemap[i] = outVerts;
8861 #endif /* VERTEXCACHE_ALIGNED */
8863 #endif /* 0 to disable shadow cache */
8867 #ifdef VERTEXCACHE_ALIGNED
8870 idSIMD_AltiVec::CreateVertexProgramShadowCache
8873 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
// For every vertex, write TWO idVec4 entries: (xyz, 1) and (xyz, 0).
// The vertex program projects the w=0 copy away from the light.
// Returns numVerts * 2, the number of idVec4s written.
8875 // vertexCache aligned
8876 assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8878 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8880 assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8882 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8883 register vector float zeroVector = (vector float)(0.0);
8884 register vector float oneVector = (vector float)(1);
// Keeps bytes 0-11 of the first operand and splices in element 0 of the
// second operand (0.0 or 1.0) as the w component.
8885 register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8886 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8889 #ifndef DRAWVERT_PADDED
8890 // every fourth one will have the same alignment. Make sure we've got enough here
// Because sizeof(idDrawVert) is a fixed multiple of 4 bytes, the alignment
// pattern of verts[i].xyz repeats every 4 vertices, so the four permute
// vectors computed from verts[0..3] are reused for every unrolled group.
8891 if ( i+3 < numVerts ) {
8892 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8893 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8894 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8895 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
// Main loop: four vertices per iteration.
8899 for ( ; i+3 < numVerts; i+=4 ) {
8900 const float *vertPtr = verts[i].xyz.ToFloatPtr();
8901 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8902 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8903 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8905 #ifndef DRAWVERT_PADDED
// Unaligned loads of each xyz via the lvsl/perm idiom.
8906 v0 = vec_ld( 0, vertPtr );
8907 v1 = vec_ld( 15, vertPtr );
8908 v2 = vec_ld( 0, vertPtr2 );
8909 v3 = vec_ld( 15, vertPtr2 );
8910 v4 = vec_ld( 0, vertPtr3 );
8911 v5 = vec_ld( 15, vertPtr3 );
8912 v6 = vec_ld( 0, vertPtr4 );
8913 v7 = vec_ld( 15, vertPtr4 );
8915 v0 = vec_perm( v0, v1, vertPerm1 );
8916 v1 = vec_perm( v2, v3, vertPerm2 );
8917 v2 = vec_perm( v4, v5, vertPerm3 );
8918 v3 = vec_perm( v6, v7, vertPerm4 );
8920 v0 = vec_ld( 0, vertPtr );
8921 v1 = vec_ld( 0, vertPtr2 );
8922 v2 = vec_ld( 0, vertPtr3 );
8923 v3 = vec_ld( 0, vertPtr4 );
// For each vertex build the (xyz, 1) and (xyz, 0) pair.
8926 v0 = vec_perm( v0, oneVector, vecPermThreeOne );
8927 v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
8929 v1 = vec_perm( v1, oneVector, vecPermThreeOne );
8930 v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
8932 v2 = vec_perm( v2, oneVector, vecPermThreeOne );
8933 v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
8935 v3 = vec_perm( v3, oneVector, vecPermThreeOne );
8936 v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
// Aligned stores: two output idVec4s per vertex, four vertices per group.
8939 ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
8940 ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
// Scalar cleanup loop for the remaining (numVerts % 4) vertices.
8945 for ( ; i < numVerts; i++ ) {
8946 const float *v = verts[i].xyz.ToFloatPtr();
8947 vertexCache[i*2+0][0] = v[0];
8948 vertexCache[i*2+1][0] = v[0];
8949 vertexCache[i*2+0][1] = v[1];
8950 vertexCache[i*2+1][1] = v[1];
8951 vertexCache[i*2+0][2] = v[2];
8952 vertexCache[i*2+1][2] = v[2];
8953 vertexCache[i*2+0][3] = 1.0f;
8954 vertexCache[i*2+1][3] = 0.0f;
8956 return numVerts * 2;
8962 idSIMD_AltiVec::CreateVertexProgramShadowCache
// Unaligned-destination variant of CreateVertexProgramShadowCache: identical
// output contract to the aligned version (each vertex written twice as idVec4,
// (x,y,z,1) then (x,y,z,0), returning numVerts * 2), but stores through the
// classic AltiVec lvsr-rotate + vec_sel edge-merge sequence so vertexCache need
// not be 16-byte aligned.
8965 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
// layout sanity checks: the code below treats idDrawVert / idVec4 as raw float arrays
8968 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8970 assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8972 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8973 register vector float zeroVector = (vector float)(0.0);
8974 register vector float oneVector = (vector float)(1);
// selects elements 0..2 of operand A plus element 0 of operand B -> (x, y, z, w-from-B)
8975 register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8976 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8979 #ifndef DRAWVERT_PADDED
8980 // every fourth one will have the same alignment. Make sure we've got enough here
8981 if ( i+3 < numVerts ) {
8982 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8983 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8984 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8985 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
// main loop: four input verts -> eight output idVec4s (128 bytes) per iteration
8989 for ( ; i+3 < numVerts; i+=4 ) {
8990 const float *vertPtr = verts[i].xyz.ToFloatPtr();
8991 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8992 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8993 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8995 #ifndef DRAWVERT_PADDED
// unaligned xyz loads: two straddling quadwords per vertex, merged by permute
8996 v0 = vec_ld( 0, vertPtr );
8997 v1 = vec_ld( 15, vertPtr );
8998 v2 = vec_ld( 0, vertPtr2 );
8999 v3 = vec_ld( 15, vertPtr2 );
9000 v4 = vec_ld( 0, vertPtr3 );
9001 v5 = vec_ld( 15, vertPtr3 );
9002 v6 = vec_ld( 0, vertPtr4 );
9003 v7 = vec_ld( 15, vertPtr4 );
9005 v0 = vec_perm( v0, v1, vertPerm1 );
9006 v1 = vec_perm( v2, v3, vertPerm2 );
9007 v2 = vec_perm( v4, v5, vertPerm3 );
9008 v3 = vec_perm( v6, v7, vertPerm4 );
// padded idDrawVert: xyz is aligned, single load per vertex
9010 v0 = vec_ld( 0, vertPtr );
9011 v1 = vec_ld( 0, vertPtr2 );
9012 v2 = vec_ld( 0, vertPtr3 );
9013 v3 = vec_ld( 0, vertPtr4 );
// append w component: vN = (x,y,z,1), partner = (x,y,z,0)
9016 v0 = vec_perm( v0, oneVector, vecPermThreeOne );
9017 v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
9019 v1 = vec_perm( v1, oneVector, vecPermThreeOne );
9020 v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
9022 v2 = vec_perm( v2, oneVector, vecPermThreeOne );
9023 v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
9025 v3 = vec_perm( v3, oneVector, vecPermThreeOne );
9026 v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
9028 // store results as unaligned
// lvsr-based right-rotate amount for this iteration's destination, plus a byte mask
// that selects "new data" vs "preserved original memory" at the quadword edges
9029 vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
9030 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
// first and last quadwords touched by this 128-byte span, so their outside bytes survive
9031 vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
9032 vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
9034 // right rotate input data
9035 v0 = vec_perm( v0, v0, storePerm );
9036 v4 = vec_perm( v4, v4, storePerm );
9037 v1 = vec_perm( v1, v1, storePerm );
9038 v5 = vec_perm( v5, v5, storePerm );
9039 v2 = vec_perm( v2, v2, storePerm );
9040 v6 = vec_perm( v6, v6, storePerm );
9041 v3 = vec_perm( v3, v3, storePerm );
9042 v7 = vec_perm( v7, v7, storePerm );
// chained edge-merged stores: each quadword blends the tail of one rotated vector
// with the head of the next (offsets 0,15,31,...,127 round down to quadword boundaries)
9044 vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
9045 vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
9046 vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
9047 vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
9048 vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
9049 vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
9050 vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
9051 vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
9052 vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
// scalar cleanup for the remaining 0..3 vertices
9056 for ( ; i < numVerts; i++ ) {
9057 const float *v = verts[i].xyz.ToFloatPtr();
9058 vertexCache[i*2+0][0] = v[0];
9059 vertexCache[i*2+1][0] = v[0];
9060 vertexCache[i*2+0][1] = v[1];
9061 vertexCache[i*2+1][1] = v[1];
9062 vertexCache[i*2+0][2] = v[2];
9063 vertexCache[i*2+1][2] = v[2];
9064 vertexCache[i*2+0][3] = 1.0f;
9065 vertexCache[i*2+1][3] = 0.0f;
9067 return numVerts * 2;
9070 #endif /* VERTEXCACHE_ALIGNED */
9072 #endif /* 0 to kill VP shader cache */
9074 #endif /* ENABLE_CREATE */
9076 #ifdef ENABLE_SOUND_ROUTINES
9078 #ifdef SOUND_DEST_ALIGNED
9081 idSIMD_AltiVec::UpSamplePCMTo44kHz
9083 Duplicate samples for 44kHz output.
9086 Assumes that dest starts at aligned address
// Upsamples 16-bit PCM to 44.1 kHz float output by sample duplication:
// 11025 Hz input -> each sample written 4x, 22050 Hz -> 2x, 44100 Hz -> 1:1
// short-to-float conversion only. Stereo paths duplicate L/R pairs so channel
// interleaving is preserved. This variant requires dest to be 16-byte aligned
// (SOUND_DEST_ALIGNED).
9089 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9092 assert( IS_16BYTE_ALIGNED( dest[0] ) );
9094 vector signed short vs0, vs1;
9095 register vector signed int vi0, vi1;
9096 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
// duplicate a stereo pair across a whole vector: (L,R,L,R) from the low/high halves
9098 register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9099 register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
// duplicate each mono sample once: (s0,s0,s1,s1) from the low/high halves
9101 register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9102 register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9104 // If this can be assumed true, we can eliminate another conditional that checks to see if we can
9105 // load up a vector before the loop
9106 assert( numSamples >= 12 );
9108 if ( kHz == 11025 ) {
9109 if ( numChannels == 1 ) {
// prime the rolling unaligned-load pipeline: vsOld holds the previous quadword,
// permVec realigns (vsOld, next quadword) to the element boundary of src
9113 vector signed short vsOld = vec_ld( 0, &src[i] );
9114 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9116 for ( ; i+7 < numSamples; i+= 8 ) {
9118 vs1 = vec_ld( 15, &src[i] );
9119 vs0 = vec_perm( vsOld, vs1, permVec );
9122 // unpack shorts to ints
9123 vi0 = vec_unpackh( vs0 );
9124 vi1 = vec_unpackl( vs0 );
9125 // convert ints to floats
9126 v0 = vec_ctf( vi0, 0 );
9127 v1 = vec_ctf( vi1, 0 );
9128 // permute into vectors in the order to store
// 4x duplication: splat each of the 8 samples across its own output vector
9130 v2 = vec_splat( v0, 0 );
9131 v3 = vec_splat( v0, 1 );
9132 v4 = vec_splat( v0, 2 );
9133 v5 = vec_splat( v0, 3 );
9134 v6 = vec_splat( v1, 0 );
9135 v7 = vec_splat( v1, 1 );
9136 v8 = vec_splat( v1, 2 );
9137 v9 = vec_splat( v1, 3 );
9140 ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
// scalar cleanup
9143 for (; i < numSamples; i++ ) {
9144 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
// 11025 Hz stereo: numSamples counts interleaved shorts; each L/R pair is written 4x
9149 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9150 vector signed short vsOld = vec_ld( 0, &src[0] );
9152 for ( ; i+7 < numSamples; i += 8 ) {
9154 vs1 = vec_ld( 15, &src[i] );
9155 vs0 = vec_perm( vsOld, vs1, permVec );
9158 // unpack shorts to ints
9159 vi0 = vec_unpackh( vs0 );
9160 vi1 = vec_unpackl( vs0 );
9161 // convert ints to floats
9162 v0 = vec_ctf( vi0, 0 );
9163 v1 = vec_ctf( vi1, 0 );
9164 // put into vectors in order to store
// (odd-numbered result vectors v3/v5/v7/v9 are assigned on lines not shown here)
9165 v2 = vec_perm( v0, v0, vecFirstHalf );
9167 v4 = vec_perm( v0, v0, vecSecondHalf );
9169 v6 = vec_perm( v1, v1, vecFirstHalf );
9171 v8 = vec_perm (v1, v1, vecSecondHalf );
9175 ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
// scalar cleanup, two shorts (one stereo frame) at a time
9178 for ( ; i < numSamples; i += 2 ) {
9179 dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9180 dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9183 } else if ( kHz == 22050 ) {
9184 if ( numChannels == 1 ) {
9186 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9187 vector signed short vsOld = vec_ld( 0, &src[0] );
9189 for ( i = 0; i+7 < numSamples; i += 8 ) {
// NOTE(review): this loads offset 0, but every sibling path loads the *next*
// quadword at offset 15 before the realigning vec_perm -- looks inconsistent
// for a misaligned src; confirm against the 11025/44100 paths before changing.
9191 vs1 = vec_ld( 0, &src[i] );
9192 vs0 = vec_perm( vsOld, vs1, permVec );
9195 // unpack shorts to ints
9196 vi0 = vec_unpackh( vs0 );
9197 vi1 = vec_unpackl( vs0 );
9198 // convert ints to floats
9199 v0 = vec_ctf( vi0, 0 );
9200 v1 = vec_ctf( vi1, 0 );
9201 // put into vectors in order to store
// 2x duplication of each mono sample
9202 v2 = vec_perm( v0, v0, vecBottom );
9203 v3 = vec_perm( v0, v0, vecTop );
9204 v4 = vec_perm( v1, v1, vecBottom );
9205 v5 = vec_perm (v1, v1, vecTop );
9208 ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9211 for ( ; i < numSamples; i++ ) {
9212 dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
// 22050 Hz stereo: each L/R pair written twice
9216 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9217 vector signed short vsOld = vec_ld( 0, &src[0] );
9219 for ( i = 0; i+7 < numSamples; i += 8 ) {
9221 vs1 = vec_ld( 15, &src[i] );
9222 vs0 = vec_perm( vsOld, vs1, permVec );
9225 // unpack shorts to ints
9226 vi0 = vec_unpackh( vs0 );
9227 vi1 = vec_unpackl( vs0 );
9228 // convert ints to floats
9229 v0 = vec_ctf( vi0, 0 );
9230 v1 = vec_ctf( vi1, 0 );
9231 // put into vectors in order to store
9232 v2 = vec_perm( v0, v0, vecFirstHalf );
9233 v3 = vec_perm( v0, v0, vecSecondHalf );
9234 v4 = vec_perm( v1, v1, vecFirstHalf );
9235 v5 = vec_perm (v1, v1, vecSecondHalf );
9238 ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9241 for ( ; i < numSamples; i += 2 ) {
9242 dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9243 dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
// 44100 Hz: no duplication, straight short -> float conversion (mono or stereo)
9246 } else if ( kHz == 44100 ) {
9248 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9249 vector signed short vsOld = vec_ld( 0, &src[0] );
9251 for ( i = 0; i+7 < numSamples; i += 8 ) {
9252 vs1 = vec_ld( 15, &src[i] );
9253 vs0 = vec_perm( vsOld, vs1, permVec );
9256 //unpack shorts to ints
9257 vi0 = vec_unpackh( vs0 );
9258 vi1 = vec_unpackl( vs0 );
9260 //convert ints to floats
9261 v0 = vec_ctf( vi0, 0 );
9262 v1 = vec_ctf( vi1, 0 );
9265 ALIGNED_STORE2( &dest[i], v0, v1 );
9268 for ( ; i < numSamples; i++ ) {
9269 dest[i] = (float) src[i];
9280 idSIMD_AltiVec::UpSamplePCMTo44kHz
9282 Duplicate samples for 44kHz output.
// Unaligned-destination variant of UpSamplePCMTo44kHz: same duplication scheme as
// the aligned version (11025 Hz -> 4x, 22050 Hz -> 2x, 44100 Hz -> 1:1 conversion),
// but all stores go through the lvsr-rotate + vec_sel edge-merge sequence so dest
// need not be 16-byte aligned. vecDest carries the boundary quadword between
// iterations so partially-covered quadwords keep their original bytes.
9288 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9290 vector signed short vs0, vs1;
9291 register vector signed int vi0, vi1;
9292 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
// duplicate a stereo pair across a whole vector: (L,R,L,R)
9294 register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9295 register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
// duplicate each mono sample once: (s0,s0,s1,s1)
9297 register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9298 register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9300 // calculate perm vector and masks for stores
9301 vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9302 // original values of dest
9303 vector float vecDest = vec_ld( 0, &dest[0] );
// byte mask: 0 for positions holding preserved dest bytes, ~0 for new data
9304 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9306 if ( kHz == 11025 ) {
9307 if ( numChannels == 1 ) {
// rolling unaligned-load pipeline for src (vsOld = previous quadword)
9311 vector signed short vsOld = vec_ld( 0, &src[i] );
9312 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9314 for ( ; i+7 < numSamples; i+= 8 ) {
9316 vs1 = vec_ld( 15, &src[i] );
9317 vs0 = vec_perm( vsOld, vs1, permVec );
// last quadword this iteration touches, so its trailing bytes can be preserved
9319 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9321 // unpack shorts to ints
9322 vi0 = vec_unpackh( vs0 );
9323 vi1 = vec_unpackl( vs0 );
9324 // convert ints to floats
9325 v0 = vec_ctf( vi0, 0 );
9326 v1 = vec_ctf( vi1, 0 );
9327 // permute into vectors in the order to store
// 4x duplication: splat each of the 8 samples across its own vector
9329 v2 = vec_splat( v0, 0 );
9330 v3 = vec_splat( v0, 1 );
9331 v4 = vec_splat( v0, 2 );
9332 v5 = vec_splat( v0, 3 );
9333 v6 = vec_splat( v1, 0 );
9334 v7 = vec_splat( v1, 1 );
9335 v8 = vec_splat( v1, 2 );
9336 v9 = vec_splat( v1, 3 );
// rotate all outputs to the destination's alignment
9338 v2 = vec_perm( v2, v2, storePerm );
9339 v3 = vec_perm( v3, v3, storePerm );
9340 v4 = vec_perm( v4, v4, storePerm );
9341 v5 = vec_perm( v5, v5, storePerm );
9342 v6 = vec_perm( v6, v6, storePerm );
9343 v7 = vec_perm( v7, v7, storePerm );
9344 v8 = vec_perm( v8, v8, storePerm );
9345 v9 = vec_perm( v9, v9, storePerm );
// chained edge-merged stores; vecDest rolls the boundary quadword to the next iteration
9348 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9349 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9350 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9351 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9352 vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9353 vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9354 vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9355 vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9356 vecDest = vec_sel( v9, vecDestEnd, mask );
9357 vec_st( vecDest, 127, &dest[i*4] );
// scalar cleanup
9360 for (; i < numSamples; i++ ) {
9361 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
// 11025 Hz stereo: each L/R pair written 4x
9366 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9367 vector signed short vsOld = vec_ld( 0, &src[0] );
9369 for ( ; i+7 < numSamples; i += 8 ) {
9371 vs1 = vec_ld( 15, &src[i] );
9372 vs0 = vec_perm( vsOld, vs1, permVec );
9374 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9376 // unpack shorts to ints
9377 vi0 = vec_unpackh( vs0 );
9378 vi1 = vec_unpackl( vs0 );
9379 // convert ints to floats
9380 v0 = vec_ctf( vi0, 0 );
9381 v1 = vec_ctf( vi1, 0 );
9382 // put into vectors in order to store
// (odd-numbered result vectors v3/v5/v7/v9 are assigned on lines not shown here)
9383 v2 = vec_perm( v0, v0, vecFirstHalf );
9385 v4 = vec_perm( v0, v0, vecSecondHalf );
9387 v6 = vec_perm( v1, v1, vecFirstHalf );
9389 v8 = vec_perm (v1, v1, vecSecondHalf );
9392 v2 = vec_perm( v2, v2, storePerm );
9393 v3 = vec_perm( v3, v3, storePerm );
9394 v4 = vec_perm( v4, v4, storePerm );
9395 v5 = vec_perm( v5, v5, storePerm );
9396 v6 = vec_perm( v6, v6, storePerm );
9397 v7 = vec_perm( v7, v7, storePerm );
9398 v8 = vec_perm( v8, v8, storePerm );
9399 v9 = vec_perm( v9, v9, storePerm );
9402 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9403 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9404 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9405 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9406 vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9407 vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9408 vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9409 vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9410 vecDest = vec_sel( v9, vecDestEnd, mask );
9411 vec_st( vecDest, 127, &dest[i*4] );
9414 for ( ; i < numSamples; i += 2 ) {
9415 dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9416 dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9419 } else if ( kHz == 22050 ) {
9420 if ( numChannels == 1 ) {
9422 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9423 vector signed short vsOld = vec_ld( 0, &src[0] );
9425 for ( i = 0; i+7 < numSamples; i += 8 ) {
// NOTE(review): offset 0 here, but sibling paths load the *next* quadword at
// offset 15 before the realigning vec_perm -- looks inconsistent for a
// misaligned src; confirm against the 11025/44100 paths before changing.
9427 vs1 = vec_ld( 0, &src[i] );
9428 vs0 = vec_perm( vsOld, vs1, permVec );
9430 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9432 // unpack shorts to ints
9433 vi0 = vec_unpackh( vs0 );
9434 vi1 = vec_unpackl( vs0 );
9435 // convert ints to floats
9436 v0 = vec_ctf( vi0, 0 );
9437 v1 = vec_ctf( vi1, 0 );
9438 // put into vectors in order to store
// 2x duplication of each mono sample
9439 v2 = vec_perm( v0, v0, vecBottom );
9440 v3 = vec_perm( v0, v0, vecTop );
9441 v4 = vec_perm( v1, v1, vecBottom );
9442 v5 = vec_perm (v1, v1, vecTop );
9444 v2 = vec_perm( v2, v2, storePerm );
9445 v3 = vec_perm( v3, v3, storePerm );
9446 v4 = vec_perm( v4, v4, storePerm );
9447 v5 = vec_perm( v5, v5, storePerm );
9450 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9451 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9452 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9453 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9454 vecDest = vec_sel( v5, vecDestEnd, mask );
9455 vec_st( vecDest, 63, &dest[i*2] );
9459 for ( ; i < numSamples; i++ ) {
9460 dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
// 22050 Hz stereo: each L/R pair written twice
9464 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9465 vector signed short vsOld = vec_ld( 0, &src[0] );
9467 for ( i = 0; i+7 < numSamples; i += 8 ) {
9469 vs1 = vec_ld( 15, &src[i] );
9470 vs0 = vec_perm( vsOld, vs1, permVec );
9472 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9474 // unpack shorts to ints
9475 vi0 = vec_unpackh( vs0 );
9476 vi1 = vec_unpackl( vs0 );
9477 // convert ints to floats
9478 v0 = vec_ctf( vi0, 0 );
9479 v1 = vec_ctf( vi1, 0 );
9480 // put into vectors in order to store
9481 v2 = vec_perm( v0, v0, vecFirstHalf );
9482 v3 = vec_perm( v0, v0, vecSecondHalf );
9483 v4 = vec_perm( v1, v1, vecFirstHalf );
9484 v5 = vec_perm (v1, v1, vecSecondHalf );
9486 v2 = vec_perm( v2, v2, storePerm );
9487 v3 = vec_perm( v3, v3, storePerm );
9488 v4 = vec_perm( v4, v4, storePerm );
9489 v5 = vec_perm( v5, v5, storePerm );
9492 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9493 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9494 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9495 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9496 vecDest = vec_sel( v5, vecDestEnd, mask );
9497 vec_st( vecDest, 63, &dest[i*2] );
9500 for ( ; i < numSamples; i += 2 ) {
9501 dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9502 dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
// 44100 Hz: no duplication, straight short -> float conversion
9505 } else if ( kHz == 44100 ) {
9507 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9508 vector signed short vsOld = vec_ld( 0, &src[0] );
9510 for ( i = 0; i+7 < numSamples; i += 8 ) {
9511 //vs0 = vec_ld( 0, &src[i] );
9512 vs1 = vec_ld( 15, &src[i] );
9513 vs0 = vec_perm( vsOld, vs1, permVec );
9515 vector float vecDestEnd = vec_ld( 31, &dest[i] );
9517 //unpack shorts to ints
9518 vi0 = vec_unpackh( vs0 );
9519 vi1 = vec_unpackl( vs0 );
9521 //convert ints to floats
9522 v0 = vec_ctf( vi0, 0 );
9523 v1 = vec_ctf( vi1, 0 );
9525 v0 = vec_perm( v0, v0, storePerm );
9526 v1 = vec_perm( v1, v1, storePerm );
9529 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
9530 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
9531 vecDest = vec_sel( v1, vecDestEnd, mask );
9532 vec_st( vecDest, 31, &dest[i] );
9535 for ( ; i < numSamples; i++ ) {
9536 dest[i] = (float) src[i];
9545 #ifdef SOUND_DEST_ALIGNED
9548 idSIMD_AltiVec::UpSampleOGGTo44kHz
9550 Duplicate samples for 44kHz output.
9553 Assumes that dest starts at aligned address
// Upsamples decoded OGG float samples (nominally -1..1) to 44.1 kHz output,
// scaling by 32768 to match the PCM float range: 11025 Hz -> each sample 4x,
// 22050 Hz -> 2x, 44100 Hz -> 1:1 scale only. ogg[] holds one plane per channel
// (non-interleaved); stereo output interleaves L/R. This variant requires dest
// to be 16-byte aligned (SOUND_DEST_ALIGNED).
9556 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9558 assert( IS_16BYTE_ALIGNED( dest[0] ) );
9560 register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9561 register vector float constVec, zeroVector;
9562 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9563 vector unsigned char vecPerm1;
9564 vector unsigned char vecPerm2;
// mono 2x duplication: (s0,s0,s1,s1) / (s2,s2,s3,s3)
9566 vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9567 vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
// stereo 2x duplication: pick element k of L (operand A) and of R (operand B), twice -> (L,R,L,R)
9568 vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9569 vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9570 vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9571 vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
// scale factor from normalized float to 16-bit PCM range
9573 constVec = (vector float)(32768.0f);
9574 zeroVector = (vector float)(0.0);
9576 if ( kHz == 11025 ) {
9577 if ( numChannels == 1 ) {
9578 // calculate perm vector and do first load
9579 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9580 v10 = vec_ld( 0, &ogg[0][0] );
9583 for ( i = 0; i+7 < numSamples; i += 8 ) {
9584 // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
// (v8 takes over the previous iteration's v10 on a line not shown here)
9586 v9 = vec_ld( 15, &ogg[0][i] );
9587 v10 = vec_ld( 31, &ogg[0][i] );
9588 v0 = vec_perm( v8, v9, vecPerm1 );
9589 v1 = vec_perm( v9, v10, vecPerm1 );
9591 // now we have the elements in a vector, we want
9592 // to splat them each across their own vector
9593 oggVec1 = vec_splat( v0, 0 );
9594 oggVec2 = vec_splat( v0, 1 );
9595 oggVec3 = vec_splat( v0, 2 );
9596 oggVec4 = vec_splat( v0, 3 );
9597 oggVec5 = vec_splat( v1, 0 );
9598 oggVec6 = vec_splat( v1, 1 );
9599 oggVec7 = vec_splat( v1, 2 );
9600 oggVec8 = vec_splat( v1, 3 );
// scale each splatted sample by 32768
9602 v0 = vec_madd( oggVec1, constVec, zeroVector );
9603 v1 = vec_madd( oggVec2, constVec, zeroVector );
9604 v2 = vec_madd( oggVec3, constVec, zeroVector );
9605 v3 = vec_madd( oggVec4, constVec, zeroVector );
9606 v4 = vec_madd( oggVec5, constVec, zeroVector );
9607 v5 = vec_madd( oggVec6, constVec, zeroVector );
9608 v6 = vec_madd( oggVec7, constVec, zeroVector );
9609 v7 = vec_madd( oggVec8, constVec, zeroVector );
9612 ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
// scalar cleanup
9617 for ( ; i < numSamples; i++ ) {
9618 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
// 11025 Hz stereo: numSamples counts interleaved output samples, planes iterate numSamples >> 1
9623 // calculate perm vec for ogg
9624 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9625 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9626 v7 = vec_ld( 0, &ogg[1][0] );
9627 v9 = vec_ld( 0, &ogg[0][0] );
9630 for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
9631 // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9633 v9 = vec_ld( 15, &ogg[0][i] );
9634 v0 = vec_perm( v8, v9, vecPerm1 );
9636 // now we have the elements in a vector, we want
9637 // to splat them each across their own vector
9638 oggVec1 = vec_splat( v0, 0 );
9639 oggVec2 = vec_splat( v0, 1 );
9640 oggVec3 = vec_splat( v0, 2 );
9641 oggVec4 = vec_splat( v0, 3 );
9643 // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9645 v7 = vec_ld( 15, &ogg[1][i] );
9646 v1 = vec_perm( v6, v7, vecPerm2 );
9648 // now we have the elements in a vector, we want
9649 // to splat them each across their own vector
9650 oggVec5 = vec_splat( v1, 0 );
9651 oggVec6 = vec_splat( v1, 1 );
9652 oggVec7 = vec_splat( v1, 2 );
9653 oggVec8 = vec_splat( v1, 3 );
9655 oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9656 oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9657 oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9658 oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9659 oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9660 oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9661 oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9662 oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9664 //merge generates the interleaved pattern that we want and it
9665 //doesn't require a permute vector, so use that instead
9666 v0 = vec_mergeh( oggVec1, oggVec5 );
9667 v1 = vec_mergel( oggVec1, oggVec5 );
9668 v2 = vec_mergeh( oggVec2, oggVec6 );
9669 v3 = vec_mergel( oggVec2, oggVec6 );
9671 v4 = vec_mergeh( oggVec3, oggVec7 );
9672 v5 = vec_mergel( oggVec3, oggVec7 );
9673 v6 = vec_mergeh( oggVec4, oggVec8 );
9674 v10 = vec_mergel( oggVec4, oggVec8 );
9677 ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
9681 for ( ; i < numSamples >> 1; i++ ) {
9682 dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
9683 dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
9686 } else if ( kHz == 22050 ) {
9687 if ( numChannels == 1 ) {
9689 // calculate perm vector and do first load
9690 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9691 v10 = vec_ld( 0, &ogg[0][0] );
9695 for ( i = 0; i+7 < numSamples; i += 8 ) {
9696 // load values from ogg
9698 v9 = vec_ld( 15, &ogg[0][i] );
9699 v10 = vec_ld( 31, &ogg[0][i] );
9700 v0 = vec_perm( v8, v9, vecPerm1 );
9701 v1 = vec_perm( v9, v10, vecPerm1 );
// scale, then 2x-duplicate each sample
9704 v0 = vec_madd( v0, constVec, zeroVector );
9705 v1 = vec_madd( v1, constVec, zeroVector );
9707 // permute into results vectors to store
9708 v5 = vec_perm( v0, v0, vecOneTwo );
9709 v6 = vec_perm( v0, v0, vecThreeFour);
9710 v7 = vec_perm( v1, v1, vecOneTwo );
9711 v8 = vec_perm( v1, v1, vecThreeFour );
9714 ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
9717 for ( ; i < numSamples; i++ ) {
9718 dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
// 22050 Hz stereo: scale both planes, interleave L/R with duplication
9722 // calculate perm vector and do first load
9723 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9724 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9725 v7 = vec_ld( 0, &ogg[1][0] );
9726 v9 = vec_ld( 0, &ogg[0][0] );
9729 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9730 // load ogg[0][i] to ogg[0][i+3]
9732 v9 = vec_ld( 15, &ogg[0][i] );
9733 v0 = vec_perm( v8, v9, vecPerm1 );
9735 // load ogg[1][i] to ogg[1][i+3]
9737 v7 = vec_ld( 15, &ogg[1][i] );
9738 v1 = vec_perm( v6, v7, vecPerm2 );
9741 v0 = vec_madd( v0, constVec, zeroVector );
9742 v1 = vec_madd( v1, constVec, zeroVector );
9744 // generate result vectors to store
9745 v2 = vec_perm( v0, v1, vecFirst );
9746 v3 = vec_perm( v0, v1, vecSecond );
9747 v4 = vec_perm( v0, v1, vecThird );
9748 v5 = vec_perm( v0, v1, vecFourth );
9751 ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
9754 for ( ; i < numSamples >> 1; i++ ) {
9755 dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
9756 dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
// 44100 Hz: no duplication, scale only
9759 } else if ( kHz == 44100 ) {
9760 if ( numChannels == 1 ) {
9761 // calculate perm vector and do first load
9762 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9764 v9 = vec_ld( 0, &ogg[0][0] );
9767 for ( i = 0; i+7 < numSamples; i += 8 ) {
9768 // load values from ogg
9770 v7 = vec_ld( 15, &ogg[0][i] );
9772 v9 = vec_ld( 31, &ogg[0][i] );
9774 v0 = vec_perm( v8, v7, vecPerm1 );
9775 v1 = vec_perm( v6, v9, vecPerm1 );
9778 v0 = vec_madd( v0, constVec, zeroVector );
9779 v1 = vec_madd( v1, constVec, zeroVector );
9781 ALIGNED_STORE2( &dest[i], v0, v1 );
9785 for ( ; i < numSamples; i++ ) {
9786 dest[i*1+0] = ogg[0][i] * 32768.0f;
// 44100 Hz stereo: scale both planes, mergeh/mergel interleaves L/R
9790 // calculate perm vector and do first load
9791 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9792 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9793 v7 = vec_ld( 0, &ogg[1][0] );
9794 v9 = vec_ld( 0, &ogg[0][0] );
9797 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9799 v9 = vec_ld( 15, &ogg[0][i] );
9800 v0 = vec_perm( v8, v9, vecPerm1 );
9802 // load ogg[1][i] to ogg[1][i+3]
9804 v7 = vec_ld( 15, &ogg[1][i] );
9805 v1 = vec_perm( v6, v7, vecPerm2 );
9808 v0 = vec_madd( v0, constVec, zeroVector );
9809 v1 = vec_madd( v1, constVec, zeroVector );
9811 // generate result vectors
9812 v2 = vec_mergeh( v0, v1 );
9813 v3 = vec_mergel( v0, v1 );
9816 ALIGNED_STORE2( &dest[i*2], v2, v3 );
9819 for ( ; i < numSamples >> 1; i++ ) {
9820 dest[i*2+0] = ogg[0][i] * 32768.0f;
9821 dest[i*2+1] = ogg[1][i] * 32768.0f;
9833 idSIMD_AltiVec::UpSampleOGGTo44kHz
9835 Duplicate samples for 44kHz output.
9841 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9843 register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9844 register vector float constVec, zeroVector;
9845 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9846 vector unsigned char vecPerm1;
9847 vector unsigned char vecPerm2;
9849 vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9850 vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9851 vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9852 vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9853 vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9854 vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9856 vector unsigned char storePerm;
9858 constVec = (vector float)(32768.0f);
9859 zeroVector = (vector float)(0.0);
9861 // calculate perm vector and masks for stores
9862 storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9863 // original values of dest
9864 vector float vecDest = vec_ld( 0, &dest[0] );
9865 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9867 if ( kHz == 11025 ) {
9868 if ( numChannels == 1 ) {
9869 // calculate perm vector and do first load
9870 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9871 v10 = vec_ld( 0, &ogg[0][0] );
9874 for ( i = 0; i+7 < numSamples; i += 8 ) {
9875 // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
9877 v9 = vec_ld( 15, &ogg[0][i] );
9878 v10 = vec_ld( 31, &ogg[0][i] );
9879 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9880 v0 = vec_perm( v8, v9, vecPerm1 );
9881 v1 = vec_perm( v9, v10, vecPerm1 );
9883 // now we have the elements in a vector, we want
9884 // to splat them each accross their own vector
9885 oggVec1 = vec_splat( v0, 0 );
9886 oggVec2 = vec_splat( v0, 1 );
9887 oggVec3 = vec_splat( v0, 2 );
9888 oggVec4 = vec_splat( v0, 3 );
9889 oggVec5 = vec_splat( v1, 0 );
9890 oggVec6 = vec_splat( v1, 1 );
9891 oggVec7 = vec_splat( v1, 2 );
9892 oggVec8 = vec_splat( v1, 3 );
9894 v0 = vec_madd( oggVec1, constVec, zeroVector );
9895 v1 = vec_madd( oggVec2, constVec, zeroVector );
9896 v2 = vec_madd( oggVec3, constVec, zeroVector );
9897 v3 = vec_madd( oggVec4, constVec, zeroVector );
9898 v4 = vec_madd( oggVec5, constVec, zeroVector );
9899 v5 = vec_madd( oggVec6, constVec, zeroVector );
9900 v6 = vec_madd( oggVec7, constVec, zeroVector );
9901 v7 = vec_madd( oggVec8, constVec, zeroVector );
9903 // rotate input data
9904 v0 = vec_perm( v0, v0, storePerm );
9905 v1 = vec_perm( v1, v1, storePerm );
9906 v2 = vec_perm( v2, v2, storePerm );
9907 v3 = vec_perm( v3, v3, storePerm );
9908 v4 = vec_perm( v4, v4, storePerm );
9909 v5 = vec_perm( v5, v5, storePerm );
9910 v6 = vec_perm( v6, v6, storePerm );
9911 v7 = vec_perm( v7, v7, storePerm );
9914 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
9915 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
9916 vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
9917 vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
9918 vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
9919 vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
9920 vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
9921 vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
9922 vecDest = vec_sel( v7, vecDestEnd, mask );
9923 vec_st( vecDest, 127, &dest[i*4] );
9927 for ( ; i < numSamples; i++ ) {
9928 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9933 // calculate perm vec for ogg
9934 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9935 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9936 v7 = vec_ld( 0, &ogg[1][0] );
9937 v9 = vec_ld( 0, &ogg[0][0] );
9940 for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
9941 // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9943 v9 = vec_ld( 15, &ogg[0][i] );
9944 vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
9945 v0 = vec_perm( v8, v9, vecPerm1 );
9947 // now we have the elements in a vector, we want
9948 // to splat them each accross their own vector
9949 oggVec1 = vec_splat( v0, 0 );
9950 oggVec2 = vec_splat( v0, 1 );
9951 oggVec3 = vec_splat( v0, 2 );
9952 oggVec4 = vec_splat( v0, 3 );
9954 // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9956 v7 = vec_ld( 15, &ogg[1][i] );
9957 v1 = vec_perm( v6, v7, vecPerm2 );
9959 // now we have the elements in a vector, we want
9960 // to splat them each accross their own vector
9961 oggVec5 = vec_splat( v1, 0 );
9962 oggVec6 = vec_splat( v1, 1 );
9963 oggVec7 = vec_splat( v1, 2 );
9964 oggVec8 = vec_splat( v1, 3 );
9966 oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9967 oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9968 oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9969 oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9970 oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9971 oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9972 oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9973 oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9975 //merge generates the interleaved pattern that we want and it
9976 //doesn't require a permute vector, so use that instead
9977 v0 = vec_mergeh( oggVec1, oggVec5 );
9978 v1 = vec_mergel( oggVec1, oggVec5 );
9979 v2 = vec_mergeh( oggVec2, oggVec6 );
9980 v3 = vec_mergel( oggVec2, oggVec6 );
9982 v4 = vec_mergeh( oggVec3, oggVec7 );
9983 v5 = vec_mergel( oggVec3, oggVec7 );
9984 v6 = vec_mergeh( oggVec4, oggVec8 );
9985 v10 = vec_mergel( oggVec4, oggVec8 );
9987 // rotate input data
9988 v0 = vec_perm( v0, v0, storePerm );
9989 v1 = vec_perm( v1, v1, storePerm );
9990 v2 = vec_perm( v2, v2, storePerm );
9991 v3 = vec_perm( v3, v3, storePerm );
9992 v4 = vec_perm( v4, v4, storePerm );
9993 v5 = vec_perm( v5, v5, storePerm );
9994 v6 = vec_perm( v6, v6, storePerm );
9995 v10 = vec_perm( v10, v10, storePerm );
9998 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
9999 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
10000 vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
10001 vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
10002 vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
10003 vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
10004 vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
10005 vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
10006 vecDest = vec_sel( v10, vecDestEnd, mask );
10007 vec_st( vecDest, 127, &dest[i*8] );
10011 for ( ; i < numSamples >> 1; i++ ) {
10012 dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
10013 dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
10016 } else if ( kHz == 22050 ) {
10017 if ( numChannels == 1 ) {
10019 // calculate perm vector and do first load
10020 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10021 v10 = vec_ld( 0, &ogg[0][0] );
10025 for ( i = 0; i+7 < numSamples; i += 8 ) {
10027 // load values from ogg
10029 v9 = vec_ld( 15, &ogg[0][i] );
10030 v10 = vec_ld( 31, &ogg[0][i] );
10031 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
10032 v0 = vec_perm( v8, v9, vecPerm1 );
10033 v1 = vec_perm( v9, v10, vecPerm1 );
10036 v0 = vec_madd( v0, constVec, zeroVector );
10037 v1 = vec_madd( v1, constVec, zeroVector );
10039 // permute into results vectors to store
10040 v5 = vec_perm( v0, v0, vecOneTwo );
10041 v6 = vec_perm( v0, v0, vecThreeFour);
10042 v7 = vec_perm( v1, v1, vecOneTwo );
10043 v8 = vec_perm( v1, v1, vecThreeFour );
10045 // rotate input data
10046 v5 = vec_perm( v5, v5, storePerm );
10047 v6 = vec_perm( v6, v6, storePerm );
10048 v7 = vec_perm( v7, v7, storePerm );
10049 v8 = vec_perm( v8, v8, storePerm );
10052 vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
10053 vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
10054 vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
10055 vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
10056 vecDest = vec_sel( v8, vecDestEnd, mask );
10057 vec_st( vecDest, 63, &dest[i*2] );
10061 for ( ; i < numSamples; i++ ) {
10062 dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
10066 // calculate perm vector and do first load
10067 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10068 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10069 v7 = vec_ld( 0, &ogg[1][0] );
10070 v9 = vec_ld( 0, &ogg[0][0] );
10073 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10074 // load ogg[0][i] to ogg[0][i+4]
10076 v9 = vec_ld( 15, &ogg[0][i] );
10077 vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
10078 v0 = vec_perm( v8, v9, vecPerm1 );
10080 // load ogg[1][i] to ogg[1][i+3]
10082 v7 = vec_ld( 15, &ogg[1][i] );
10083 v1 = vec_perm( v6, v7, vecPerm2 );
10086 v0 = vec_madd( v0, constVec, zeroVector );
10087 v1 = vec_madd( v1, constVec, zeroVector );
10089 // generate result vectors to store
10090 v2 = vec_perm( v0, v1, vecFirst );
10091 v3 = vec_perm( v0, v1, vecSecond );
10092 v4 = vec_perm( v0, v1, vecThird );
10093 v5 = vec_perm( v0, v1, vecFourth );
10095 // rotate input data
10096 v2 = vec_perm( v2, v2, storePerm );
10097 v3 = vec_perm( v3, v3, storePerm );
10098 v4 = vec_perm( v4, v4, storePerm );
10099 v5 = vec_perm( v5, v5, storePerm );
10102 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
10103 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
10104 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
10105 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
10106 vecDest = vec_sel( v5, vecDestEnd, mask );
10107 vec_st( vecDest, 63, &dest[i*4] );
10111 for ( ; i < numSamples >> 1; i++ ) {
10112 dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
10113 dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
10116 } else if ( kHz == 44100 ) {
10117 if ( numChannels == 1 ) {
10118 // calculate perm vector and do first load
10119 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10121 v9 = vec_ld( 0, &ogg[0][0] );
10124 for ( i = 0; i+7 < numSamples; i += 8 ) {
10125 // load values from ogg
10127 v7 = vec_ld( 15, &ogg[0][i] );
10129 v9 = vec_ld( 31, &ogg[0][i] );
10130 vector float vecDestEnd = vec_ld( 31, &dest[i] );
10132 v0 = vec_perm( v8, v7, vecPerm1 );
10133 v1 = vec_perm( v6, v9, vecPerm1 );
10136 v0 = vec_madd( v0, constVec, zeroVector );
10137 v1 = vec_madd( v1, constVec, zeroVector );
10140 v0 = vec_perm( v0, v0, storePerm );
10141 v1 = vec_perm( v1, v1, storePerm );
10144 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
10145 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
10146 vecDest = vec_sel( v1, vecDestEnd, mask );
10147 vec_st( vecDest, 31, &dest[i] );
10151 for ( ; i < numSamples; i++ ) {
10152 dest[i*1+0] = ogg[0][i] * 32768.0f;
10156 // calculate perm vector and do first load
10157 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10158 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10159 v7 = vec_ld( 0, &ogg[1][0] );
10160 v9 = vec_ld( 0, &ogg[0][0] );
10163 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10165 v9 = vec_ld( 15, &ogg[0][i] );
10166 v0 = vec_perm( v8, v9, vecPerm1 );
10168 // load ogg[1][i] to ogg[1][i+3]
10170 v7 = vec_ld( 15, &ogg[1][i] );
10171 v1 = vec_perm( v6, v7, vecPerm2 );
10174 v0 = vec_madd( v0, constVec, zeroVector );
10175 v1 = vec_madd( v1, constVec, zeroVector );
10177 // generate result vectors
10178 v2 = vec_mergeh( v0, v1 );
10179 v3 = vec_mergel( v0, v1 );
10182 UNALIGNED_STORE2( &dest[i*2], v2, v3 );
10185 for ( ; i < numSamples >> 1; i++ ) {
10186 dest[i*2+0] = ogg[0][i] * 32768.0f;
10187 dest[i*2+1] = ogg[1][i] * 32768.0f;
10194 #endif /* SOUND_DEST_ALIGNED */
10196 #ifdef SOUND_DEST_ALIGNED
10199 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10202 Assumes that mixBuffer starts at aligned address
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Mixes a mono sample stream into an interleaved two-speaker (L/R) mix
	// buffer, linearly ramping each speaker's volume from lastV to currentV
	// across the buffer.  Destination must be 16-byte aligned (this variant);
	// the source sample pointer may be unaligned.
	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
	register vector float vecInc;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamplesLd1, vecSamplesLd2;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	// permutes that duplicate each mono sample into a left/right pair
	// (element indices in the trailing comments)
	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);
	// per-sample volume step for each speaker across the whole buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	// starting volumes: spkr = ( L0, R0, L1, R1 ) for the first two samples
	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];
	assert( numSamples == MIXBUFFER_SAMPLES );
	//load data into registers
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );
	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
	// interleave so vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );
	// the four speaker-volume vectors cover four consecutive L/R pairs
	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	// scale the increment so one add per vector advances a full loop iteration
	vecInc = vec_madd( vecInc, fourVec, zeroVec );
	// standard AltiVec unaligned-load setup: lvsl-derived permute plus a
	// rolling "last loaded" vector carried across iterations
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
	//need a cleanup loop
	for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
		// 8 mono samples in, 16 interleaved floats out per iteration
		//load samples and mix buffers
		vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
		vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLast = vec_ld( 31, &samples[i] );
		// align the raw loads into two contiguous sample vectors
		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
		// destination is 16-byte aligned, so plain vec_ld is safe here
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
		// duplicate each mono sample into its L and R slots
		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
		// mixBuffer += sample * speakerVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
		//add for next iteration
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10307 idSIMD_AltiVec::MixSoundTwoSpeakerMono
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Unaligned-destination variant of the mono -> two-speaker mixer: same
	// volume-ramp math as the aligned version, but mixBuffer is read and
	// written through lvsl-based permutes instead of direct aligned loads.
	register vector float vecInc;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamplesLd1, vecSamplesLd2;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	// permutes that duplicate each mono sample into a left/right pair
	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);
	// per-sample volume step for each speaker across the whole buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	// starting volumes: spkr = ( L0, R0, L1, R1 ) for the first two samples
	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];
	assert( numSamples == MIXBUFFER_SAMPLES );
	//load data into registers
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );
	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
	// interleave so vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );
	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	// scale the increment so one add per vector advances a full loop iteration
	vecInc = vec_madd( vecInc, fourVec, zeroVec );
	// unaligned-load setup for BOTH streams: permute vectors from lvsl,
	// plus rolling edge vectors carried across iterations
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
	//need a cleanup loop
	for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
		//load samples and mix buffers
		vecSamplesLd1 = vecSamplesLast;
		vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLast = vec_ld( 31, &samples[i] );
		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
		// NOTE(review): vecDest is not visibly refreshed inside this loop in
		// this excerpt — confirm against the full source that it is reloaded
		// (or that the first-iteration value is the only one consumed)
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
		// align the destination loads into contiguous vectors
		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
		// duplicate each mono sample into its L and R slots
		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
		// mixBuffer += sample * speakerVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
		UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
		//add for next iteration
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10416 #endif /* SOUND_DEST_ALIGNED */
10418 #ifdef SOUND_DEST_ALIGNED
10421 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10424 Assumes that mixBuffer starts at aligned address
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Mixes an already-interleaved stereo sample stream into the two-speaker
	// mix buffer with a linear volume ramp from lastV to currentV.  Unlike the
	// mono variant, no sample duplication is needed — samples are consumed in
	// L/R pairs directly (note the samples[i*2] indexing below).
	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecInc;
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);
	assert( numSamples == MIXBUFFER_SAMPLES );
	// per-sample volume step for each speaker across the whole buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	// starting volumes: spkr = ( L0, R0, L1, R1 ) for the first two frames
	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];
	for ( k = 0; k < 2; k++ ) {
	// load data in vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );
	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
	// interleave so vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );
	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	// scale the increment so one add per vector advances a full loop iteration
	vecInc = vec_madd( vecInc, fourVec, zeroVec );
	// unaligned-load setup for the (possibly unaligned) sample stream
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
	//need a cleanup loop
	for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
		// load mix buffers and samples
		// destination is 16-byte aligned, so plain vec_ld is safe
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
		// rolling unaligned loads of 16 interleaved source floats
		vecSamples1 = vecSamplesLast;
		vecSamples2 = vec_ld( 15, &samples[i*2] );
		vecSamples3 = vec_ld( 31, &samples[i*2] );
		vecSamples4 = vec_ld( 47, &samples[i*2] );
		vecSamplesLast = vec_ld( 63, &samples[i*2] );
		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
		// mixBuffer += sample * speakerVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
		// advance the volume ramp for the next 8 frames
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
	ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10519 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Unaligned-destination variant of the stereo -> two-speaker mixer: same
	// ramp math as the aligned version, but mixBuffer is read through
	// lvsl-based permutes and written with UNALIGNED_STORE4.
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecInc;
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);
	assert( numSamples == MIXBUFFER_SAMPLES );
	// per-sample volume step for each speaker across the whole buffer
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	// starting volumes: spkr = ( L0, R0, L1, R1 ) for the first two frames
	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];
	for ( k = 0; k < 2; k++ ) {
	// load data in vectors
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );
	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
	// interleave so vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );
	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	// scale the increment so one add per vector advances a full loop iteration
	vecInc = vec_madd( vecInc, fourVec, zeroVec );
	// unaligned-load setup for BOTH streams
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
	//need a cleanup loop
	for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
		// load mix buffers and samples
		// NOTE(review): vecDest is not visibly refreshed inside this loop in
		// this excerpt — confirm against the full source that it is reloaded
		// (or that the first-iteration value is the only one consumed)
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
		// align the destination loads into contiguous vectors
		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
		// rolling unaligned loads of 16 interleaved source floats
		vecSamples1 = vecSamplesLast;
		vecSamples2 = vec_ld( 15, &samples[i*2] );
		vecSamples3 = vec_ld( 31, &samples[i*2] );
		vecSamples4 = vec_ld( 47, &samples[i*2] );
		vecSamplesLast = vec_ld( 63, &samples[i*2] );
		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
		// mixBuffer += sample * speakerVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
		// advance the volume ramp for the next 8 frames
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
	UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10619 #endif /* SOUND_DEST_ALIGNED */
10621 #ifdef SOUND_DEST_ALIGNED
10624 idSIMD_AltiVec::MixSoundSixSpeakerMono
10627 Assumes that mixBuffer starts at aligned address
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
	// Mixes a mono sample stream into a 6-channel (5.1) interleaved mix buffer
	// with a linear per-channel volume ramp from lastV to currentV.  Each mono
	// sample is written to all six channels, so 4 samples expand to 24 output
	// floats = six vectors per loop iteration.
	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
	// permute vectors for sample
	// samplePerm2 -> ( s0, s0, s1, s1 ), samplePerm5 -> ( s2, s2, s3, s3 ):
	// these cover the vectors that straddle two samples' channel groups
	vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );
	// incL array, 6 elements repeated
	// (6 channel increments repeated 4x so 24 floats map onto six vectors)
	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	// sL array repeated
	// starting volumes for samples 0..3: each group of 6 is lastV advanced by
	// k whole increments, k = sample index within the vectorized group
	for ( k = 0; k < 6; k++ ) {
	for ( k = 6; k < 12; k++ ) {
		sL[k] = lastV[k-6] + incL[k];
	for ( k = 12; k < 18; k++ ) {
		sL[k] = lastV[k-12] + incL[k] + incL[k];
	for ( k = 18; k < 24; k++ ) {
		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
	// multiply by 2 since doing 12 at a time
	for ( k = 0; k < 24; k++ ) {
	// unaligned loads of the incL / sL scratch arrays into vectors
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );
	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );
	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
	// unaligned-load setup for the mono sample stream
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
	//need a cleanup loop
	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		//load mix buffer into vectors, assume aligned
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
		vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
		vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;
		//permute to get them ordered how we want
		// each mono sample is replicated across its 6 consecutive channel
		// slots; splats cover the whole-sample vectors, the two perms cover
		// the vectors that straddle a sample boundary
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );
		// mixBuffer += sample * channelVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
		//store out results
		ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
		// add for next iteration
		// advance the per-channel volume ramp by one vectorized group
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
10765 idSIMD_AltiVec::MixSoundSixSpeakerMono
10771 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10777 vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10778 vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10779 vector float vecSamplesLd;
10780 vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10781 vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10782 // permute vectors for sample
10783 register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10784 register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10786 assert( numSamples == MIXBUFFER_SAMPLES );
10787 assert( SPEAKER_RIGHT == 1 );
10788 assert( SPEAKER_BACKRIGHT == 5 );
10790 // incL array, 6 elements repeated
10791 incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10792 incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10793 incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10794 incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10795 incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10796 incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10798 // sL array repeated
10799 for ( k = 0; k < 6; k++ ) {
10802 for ( k = 6; k < 12; k++ ) {
10803 sL[k] = lastV[k-6] + incL[k];
10805 for ( k = 12; k < 18; k++ ) {
10806 sL[k] = lastV[k-12] + incL[k] + incL[k];
10808 for ( k = 18; k < 24; k++ ) {
10809 sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10812 // multiply by 2 since doing 12 at a time
10813 for ( k = 0; k < 24; k++ ) {
10818 vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10819 vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10821 vecIncl1 = vec_ld( 0, &incL[0] );
10822 vecIncl2 = vec_ld( 15, &incL[0] );
10823 vecIncl3 = vec_ld( 31, &incL[0] );
10824 vecIncl4 = vec_ld( 47, &incL[0] );
10825 vecIncl5 = vec_ld( 63, &incL[0] );
10826 vecIncl6 = vec_ld( 79, &incL[0] );
10827 vecIncl7 = vec_ld( 95, &incL[0] );
10829 vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10830 vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10831 vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10832 vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
10833 vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
10834 vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
10836 vecSL1 = vec_ld( 0, &sL[0] );
10837 vecSL2 = vec_ld( 15, &sL[0] );
10838 vecSL3 = vec_ld( 31, &sL[0] );
10839 vecSL4 = vec_ld( 47, &sL[0] );
10840 vecSL5 = vec_ld( 63, &sL[0] );
10841 vecSL6 = vec_ld( 79, &sL[0] );
10842 vecSL7 = vec_ld( 95, &sL[0] );
10844 vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10845 vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10846 vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10847 vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
10848 vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
10849 vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
10851 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10852 vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10853 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10854 vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10856 //since MIXBUFFER_SAMPLES is a multiple of 4, we don't
10857 //need a cleanup loop
10858 for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
10859 //load mix buffer into vectors
10860 vecMixBuffer1 = vecDest;
10861 vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
10862 vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
10863 vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
10864 vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
10865 vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
10866 vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
10868 vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10869 vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10870 vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10871 vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
10872 vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
10873 vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
10875 //load samples into vector
10876 vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
10877 vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
10878 vecSamplesLast = vecSamplesLd2;
10880 //permute to get them ordered how we want
10881 vecSamples1 = vec_splat( vecSamplesLd, 0 );
10882 vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
10883 vecSamples3 = vec_splat( vecSamplesLd, 1 );
10884 vecSamples4 = vec_splat( vecSamplesLd, 2 );
10885 vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
10886 vecSamples6 = vec_splat( vecSamplesLd, 3 );
10889 vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
10890 vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
10891 vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
10892 vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
10893 vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
10894 vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
10897 UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
10899 // add for next iteration
10900 vecSL1 = vec_add( vecSL1, vecIncl1 );
10901 vecSL2 = vec_add( vecSL2, vecIncl2 );
10902 vecSL3 = vec_add( vecSL3, vecIncl3 );
10903 vecSL4 = vec_add( vecSL4, vecIncl4 );
10904 vecSL5 = vec_add( vecSL5, vecIncl5 );
10905 vecSL6 = vec_add( vecSL6, vecIncl6 );
10909 #endif /* SOUND_DEST_ALIGNED */
10911 #ifdef SOUND_DEST_ALIGNED
10914 idSIMD_AltiVec::MixSoundSixSpeakerStereo
10917 Assumes that mixBuffer starts at aligned address
10921 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10923 // mixBuffer is aligned
10924 assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10929 vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
10930 vector float vecSL1, vecSL2, vecSL3, vecSL4;
10931 vector float vecSamplesLd;
10932 vector float vecSamples1, vecSamples2, vecSamples3;
10933 vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
10934 // permute vectors for sample
10935 vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
10936 vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
10938 assert( numSamples == MIXBUFFER_SAMPLES );
10939 assert( SPEAKER_RIGHT == 1 );
10940 assert( SPEAKER_BACKRIGHT == 5 );
10942 // incL array, 6 elements repeated
10943 incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10944 incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10945 incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10946 incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10947 incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10948 incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10950 // sL array repeated
10957 sL[6] = lastV[0] + incL[0];
10958 sL[7] = lastV[1] + incL[1];
10959 sL[8] = lastV[2] + incL[2];
10960 sL[9] = lastV[3] + incL[3];
10961 sL[10] = lastV[4] + incL[4];
10962 sL[11] = lastV[5] + incL[5];
10964 // multiply by 2 since doing 12 at a time
10978 //we aligned this data, so load it up
10979 vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10980 vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10981 vecIncl1 = vec_ld( 0, &incL[0] );
10982 vecIncl2 = vec_ld( 15, &incL[0] );
10983 vecIncl3 = vec_ld( 31, &incL[0] );
10984 vecIncl4 = vec_ld( 47, &incL[0] );
10986 vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10987 vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10988 vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10990 vecSL1 = vec_ld( 0, &sL[0] );
10991 vecSL2 = vec_ld( 15, &sL[0] );
10992 vecSL3 = vec_ld( 31, &sL[0] );
10993 vecSL4 = vec_ld( 47, &sL[0] );
10995 vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10996 vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10997 vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10999 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
11000 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11002 for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11004 //load mix buffer into vectors, assume aligned
11005 vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
11006 vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
11007 vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
11009 //load samples into vector
11010 vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11011 vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11012 vecSamplesLast = vecSamplesLd2;
11014 //permute to get them ordered how we want. For the 2nd vector,
11015 //the order happens to be the same as the order we loaded them
11016 //in, so there's no need to permute that one
11017 vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11018 vecSamples2 = vecSamplesLd;
11019 vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11022 vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11023 vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11024 vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11026 //store out results
11027 ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
11029 // add for next iteration
11030 vecSL1 = vec_add( vecSL1, vecIncl1 );
11031 vecSL2 = vec_add( vecSL2, vecIncl2 );
11032 vecSL3 = vec_add( vecSL3, vecIncl3 );
11039 idSIMD_AltiVec::MixSoundSixSpeakerStereo
11045 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
11051 vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
11052 vector float vecSL1, vecSL2, vecSL3, vecSL4;
11053 vector float vecSamplesLd;
11054 vector float vecSamples1, vecSamples2, vecSamples3;
11055 vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
11056 // permute vectors for sample
11057 vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
11058 vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
11060 assert( numSamples == MIXBUFFER_SAMPLES );
11061 assert( SPEAKER_RIGHT == 1 );
11062 assert( SPEAKER_BACKRIGHT == 5 );
11064 // incL array, 6 elements repeated
11065 incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
11066 incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
11067 incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
11068 incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
11069 incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
11070 incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
11072 // sL array repeated
11079 sL[6] = lastV[0] + incL[0];
11080 sL[7] = lastV[1] + incL[1];
11081 sL[8] = lastV[2] + incL[2];
11082 sL[9] = lastV[3] + incL[3];
11083 sL[10] = lastV[4] + incL[4];
11084 sL[11] = lastV[5] + incL[5];
11086 // multiply by 2 since doing 12 at a time
11101 vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
11102 vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
11103 vecIncl1 = vec_ld( 0, &incL[0] );
11104 vecIncl2 = vec_ld( 15, &incL[0] );
11105 vecIncl3 = vec_ld( 31, &incL[0] );
11106 vecIncl4 = vec_ld( 47, &incL[0] );
11108 vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
11109 vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
11110 vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
11112 vecSL1 = vec_ld( 0, &sL[0] );
11113 vecSL2 = vec_ld( 15, &sL[0] );
11114 vecSL3 = vec_ld( 31, &sL[0] );
11115 vecSL4 = vec_ld( 47, &sL[0] );
11117 vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
11118 vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
11119 vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
11121 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
11122 vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
11123 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11124 vector float vecDest = vec_ld( 0, &mixBuffer[0] );
11126 for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11128 //load mix buffer into vectors
11129 vecMixBuffer1 = vecDest;
11130 vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
11131 vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
11132 vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
11134 vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
11135 vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
11136 vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
11138 //load samples into vector
11139 vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11140 vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11141 vecSamplesLast = vecSamplesLd2;
11143 //permute to get them ordered how we want. For the 2nd vector,
11144 //the order happens to be the same as the order we loaded them
11145 //in, so there's no need to permute that one
11146 vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11147 vecSamples2 = vecSamplesLd;
11148 vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11151 vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11152 vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11153 vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11156 UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
11158 // add for next iteration
11159 vecSL1 = vec_add( vecSL1, vecIncl1 );
11160 vecSL2 = vec_add( vecSL2, vecIncl2 );
11161 vecSL3 = vec_add( vecSL3, vecIncl3 );
11169 idSIMD_AltiVec::MixedSoundToSamples
11172 void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
11173 //this is basically a clamp for sound mixing
11174 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
11175 register vector signed int vi0, vi1, vi2, vi3;
11176 register vector signed short vs0, vs1;
11177 register vector float minVec, maxVec, constVec;
11180 //unaligned at start, since samples is not 16-byte aligned
11181 for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
11182 samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11185 constVec = (vector float)(65536.0f);
11187 //splat min/max into a vector
11188 minVec = (vector float)(-32768.0f);
11189 maxVec = (vector float)(32767.0f);
11191 vector float vecOld = vec_ld( 0, &mixBuffer[i] );
11192 vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
11195 for ( ; i+15 < numSamples; i += 16 ) {
11198 v1 = vec_ld( 15, &mixBuffer[i] );
11199 v2 = vec_ld( 31, &mixBuffer[i] );
11200 v3 = vec_ld( 31, &mixBuffer[i] );
11201 vecOld = vec_ld( 47, &mixBuffer[i] );
11203 v0 = vec_perm( v0, v1, permVec );
11204 v1 = vec_perm( v1, v2, permVec );
11205 v2 = vec_perm( v2, v3, permVec );
11206 v3 = vec_perm( v3, vecOld, permVec );
11209 v4 = vec_max( v0, minVec );
11210 v5 = vec_max( v1, minVec );
11211 v6 = vec_max( v2, minVec );
11212 v7 = vec_max( v3, minVec );
11215 v4 = vec_min( v4, maxVec );
11216 v5 = vec_min( v5, maxVec );
11217 v6 = vec_min( v6, maxVec );
11218 v7 = vec_min( v7, maxVec );
11220 // convert floats to ints
11221 vi0 = vec_cts( v4, 0 );
11222 vi1 = vec_cts( v5, 0 );
11223 vi2 = vec_cts( v6, 0 );
11224 vi3 = vec_cts( v7, 0 );
11226 // pack ints into shorts
11227 vs0 = vec_pack( vi0, vi1 );
11228 vs1 = vec_pack( vi2, vi3 );
11229 ALIGNED_STORE2( &samples[i], vs0, vs1 );
11233 for ( ; i < numSamples ; i++ ) {
11234 samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11237 #endif /* ENABLE_SOUND_ROUTINES */
11239 #endif /* MACOS_X */