2 ===========================================================================
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
26 ===========================================================================
29 #include "../precompiled.h"
32 #include "Simd_Generic.h"
35 //===============================================================
37 // Generic implementation of idSIMDProcessor
39 //===============================================================
// Loop helpers. Each macro expands to a braced block that applies the
// statement-like macro Y to every index 0 .. count-1 in ascending order.
// An integer named `count` must be visible at the expansion site.
// UNROLL2/4/8 expand Y several times inside one scope, so Y must not declare
// a local variable when used with them; UNROLL1 gives Y its own scope.
// The masks are written as ~1 / ~3 / ~7 so the rounded-down bound stays a
// signed int regardless of the width of int.
#define UNROLL1(Y) { int _IX; for (_IX=0;_IX<count;_IX++) {Y(_IX);} }
#define UNROLL2(Y) { int _IX, _NM = count&~1; for (_IX=0;_IX<_NM;_IX+=2){Y(_IX+0);Y(_IX+1);} if (_IX < count) {Y(_IX);}}
#define UNROLL4(Y) { int _IX, _NM = count&~3; for (_IX=0;_IX<_NM;_IX+=4){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);}for(;_IX<count;_IX++){Y(_IX);}}
#define UNROLL8(Y) { int _IX, _NM = count&~7; for (_IX=0;_IX<_NM;_IX+=8){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);Y(_IX+4);Y(_IX+5);Y(_IX+6);Y(_IX+7);} _NM = count&~1; for(;_IX<_NM;_IX+=2){Y(_IX); Y(_IX+1);} if (_IX < count) {Y(_IX);} }
// NODEFAULT supplies the default label of a switch whose listed cases are
// meant to be exhaustive. Exactly one definition is selected:
//   debug builds   - trap an unexpected value with assert
//   MSVC release   - tell the optimizer the label is unreachable (__assume is
//                    a Microsoft intrinsic and does not exist elsewhere)
//   anything else  - leave the switch without doing anything
#if defined( _DEBUG )
#define NODEFAULT	default: assert( 0 )
#elif defined( _MSC_VER )
#define NODEFAULT	default: __assume( 0 )
#else
#define NODEFAULT	default: break
#endif
57 idSIMD_Generic::GetName
60 const char * idSIMD_Generic::GetName( void ) const {
61 return "generic code";
68 dst[i] = constant + src[i];
71 void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
72 #define OPER(X) dst[(X)] = src[(X)] + constant;
81 dst[i] = src0[i] + src1[i];
84 void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
85 #define OPER(X) dst[(X)] = src0[(X)] + src1[(X)];
94 dst[i] = constant - src[i];
97 void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
99 #define OPER(X) dst[(X)] = c - src[(X)];
108 dst[i] = src0[i] - src1[i];
111 void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
112 #define OPER(X) dst[(X)] = src0[(X)] - src1[(X)];
121 dst[i] = constant * src[i];
124 void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
126 #define OPER(X) (dst[(X)] = (c * src0[(X)]))
135 dst[i] = src0[i] * src1[i];
138 void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
139 #define OPER(X) (dst[(X)] = src0[(X)] * src1[(X)])
148 dst[i] = constant / divisor[i];
151 void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
153 #define OPER(X) (dst[(X)] = (c / divisor[(X)]))
162 dst[i] = src0[i] / src1[i];
165 void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
166 #define OPER(X) (dst[(X)] = src0[(X)] / src1[(X)])
173 idSIMD_Generic::MulAdd
175 dst[i] += constant * src[i];
178 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
180 #define OPER(X) (dst[(X)] += c * src[(X)])
187 idSIMD_Generic::MulAdd
189 dst[i] += src0[i] * src1[i];
192 void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
193 #define OPER(X) (dst[(X)] += src0[(X)] * src1[(X)])
200 idSIMD_Generic::MulSub
202 dst[i] -= constant * src[i];
205 void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
207 #define OPER(X) (dst[(X)] -= c * src[(X)])
214 idSIMD_Generic::MulSub
216 dst[i] -= src0[i] * src1[i];
219 void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
220 #define OPER(X) (dst[(X)] -= src0[(X)] * src1[(X)])
229 dst[i] = constant * src[i];
232 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
233 #define OPER(X) dst[(X)] = constant * src[(X)];
242 dst[i] = constant * src[i].Normal() + src[i][3];
245 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
246 #define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
255 dst[i] = constant * src[i].xyz;
258 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
259 #define OPER(X) dst[(X)] = constant * src[(X)].xyz;
268 dst[i] = constant.Normal() * src[i] + constant[3];
271 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
272 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
281 dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
284 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
285 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
294 dst[i] = constant.Normal() * src[i].xyz + constant[3];
297 void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
298 #define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
307 dst[i] = src0[i] * src1[i];
310 void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
311 #define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
// idSIMD_Generic::Dot (float arrays): writes into dot the sum of the products
// src1[i] * src2[i] over count elements.
// NOTE(review): this listing is missing lines (case labels, braces, returns and
// the final combination of the partial sums) - confirm against the full file.
// Contract as stated by the file's own header text:
320 dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
323 void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
// Short inputs: the one, two and three element sums are written out directly.
332 dot = src1[0] * src2[0];
336 dot = src1[0] * src2[0] + src1[1] * src2[1];
340 dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
// Longer inputs: four double-precision partial sums, seeded with the products
// of elements 0 to 3.
345 double s0, s1, s2, s3;
346 s0 = src1[0] * src2[0];
347 s1 = src1[1] * src2[1];
348 s2 = src1[2] * src2[2];
349 s3 = src1[3] * src2[3];
// Main loop: eight elements per iteration, two added to each partial sum.
350 for ( i = 4; i < count-7; i += 8 ) {
351 s0 += src1[i+0] * src2[i+0];
352 s1 += src1[i+1] * src2[i+1];
353 s2 += src1[i+2] * src2[i+2];
354 s3 += src1[i+3] * src2[i+3];
355 s0 += src1[i+4] * src2[i+4];
356 s1 += src1[i+5] * src2[i+5];
357 s2 += src1[i+6] * src2[i+6];
358 s3 += src1[i+7] * src2[i+7];
// Remainder: count - i elements are left over; the visible cases cover 1 to 7.
// No break is visible between the cases, so they presumably fall through from
// the highest leftover index down to index i - TODO confirm.
360 switch( count - i ) {
362 case 7: s0 += src1[i+6] * src2[i+6];
363 case 6: s1 += src1[i+5] * src2[i+5];
364 case 5: s2 += src1[i+4] * src2[i+4];
365 case 4: s3 += src1[i+3] * src2[i+3];
366 case 3: s0 += src1[i+2] * src2[i+2];
367 case 2: s1 += src1[i+1] * src2[i+1];
368 case 1: s2 += src1[i+0] * src2[i+0];
// Straightforward accumulation over all elements; presumably an alternative
// implementation selected by the preprocessor - TODO confirm.
383 for ( i = 0; i < count; i++ ) {
384 dot += src1[i] * src2[i];
392 idSIMD_Generic::CmpGT
394 dst[i] = src0[i] > constant;
397 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
398 #define OPER(X) dst[(X)] = src0[(X)] > constant;
405 idSIMD_Generic::CmpGT
407 dst[i] |= ( src0[i] > constant ) << bitNum;
410 void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
411 #define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
418 idSIMD_Generic::CmpGE
420 dst[i] = src0[i] >= constant;
423 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
424 #define OPER(X) dst[(X)] = src0[(X)] >= constant;
431 idSIMD_Generic::CmpGE
433 dst[i] |= ( src0[i] >= constant ) << bitNum;
436 void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
437 #define OPER(X) dst[(X)] |= ( src0[(X)] >= constant ) << bitNum;
444 idSIMD_Generic::CmpLT
446 dst[i] = src0[i] < constant;
449 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
450 #define OPER(X) dst[(X)] = src0[(X)] < constant;
457 idSIMD_Generic::CmpLT
459 dst[i] |= ( src0[i] < constant ) << bitNum;
462 void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
463 #define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
470 idSIMD_Generic::CmpLE
472 dst[i] = src0[i] <= constant;
475 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
476 #define OPER(X) dst[(X)] = src0[(X)] <= constant;
483 idSIMD_Generic::CmpLE
485 dst[i] |= ( src0[i] <= constant ) << bitNum;
488 void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
489 #define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
496 idSIMD_Generic::MinMax
499 void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
500 min = idMath::INFINITY; max = -idMath::INFINITY;
501 #define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
508 idSIMD_Generic::MinMax
511 void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
512 min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
513 #define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
520 idSIMD_Generic::MinMax
523 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
524 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
525 #define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
532 idSIMD_Generic::MinMax
535 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
536 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
537 #define OPER(X) const idVec3 &v = src[(X)].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
544 idSIMD_Generic::MinMax
547 void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
548 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
549 #define OPER(X) const idVec3 &v = src[indexes[(X)]].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
556 idSIMD_Generic::Clamp
559 void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
560 #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
567 idSIMD_Generic::ClampMin
570 void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
571 #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
578 idSIMD_Generic::ClampMax
581 void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
582 #define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
589 idSIMD_Generic::Memcpy
592 void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {
593 memcpy( dst, src, count );
598 idSIMD_Generic::Memset
601 void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {
602 memset( dst, val, count );
607 idSIMD_Generic::Zero16
610 void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {
611 memset( dst, 0, count * sizeof( float ) );
616 idSIMD_Generic::Negate16
619 void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
620 unsigned int *ptr = reinterpret_cast<unsigned int *>(dst);
621 #define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
628 idSIMD_Generic::Copy16
631 void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
632 #define OPER(X) dst[(X)] = src[(X)]
639 idSIMD_Generic::Add16
642 void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
643 #define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
650 idSIMD_Generic::Sub16
653 void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
654 #define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
661 idSIMD_Generic::Mul16
664 void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
665 #define OPER(X) dst[(X)] = src1[(X)] * constant
672 idSIMD_Generic::AddAssign16
675 void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
676 #define OPER(X) dst[(X)] += src[(X)]
683 idSIMD_Generic::SubAssign16
686 void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
687 #define OPER(X) dst[(X)] -= src[(X)]
694 idSIMD_Generic::MulAssign16
697 void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
698 #define OPER(X) dst[(X)] *= constant
// idSIMD_Generic::MatX_MultiplyVecX: assigns to each dst element a sum of
// products of matrix entries (read through mPtr) with the entries of vec.
// NOTE(review): this listing is missing lines (local declarations, case labels,
// the per-row advance of mPtr and the store in the generic path); presumably
// mPtr moves forward one matrix row per iteration so that dst = mat * vec -
// confirm against the full file.
705 idSIMD_Generic::MatX_MultiplyVecX
708 void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
710 const float *mPtr, *vPtr;
// vec must supply a value for every column and dst must have room for every row.
713 assert( vec.GetSize() >= mat.GetNumColumns() );
714 assert( dst.GetSize() >= mat.GetNumRows() );
716 mPtr = mat.ToFloatPtr();
717 vPtr = vec.ToFloatPtr();
718 dstPtr = dst.ToFloatPtr();
719 numRows = mat.GetNumRows();
// Dispatch on the column count: the loops below write the sum out explicitly
// for one to six terms.
720 switch( mat.GetNumColumns() ) {
722 for ( i = 0; i < numRows; i++ ) {
723 dstPtr[i] = mPtr[0] * vPtr[0];
728 for ( i = 0; i < numRows; i++ ) {
729 dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
734 for ( i = 0; i < numRows; i++ ) {
735 dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
740 for ( i = 0; i < numRows; i++ ) {
741 dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
747 for ( i = 0; i < numRows; i++ ) {
748 dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
749 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
754 for ( i = 0; i < numRows; i++ ) {
755 dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
756 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
// Generic path: accumulate the sum term by term for any other column count.
761 int numColumns = mat.GetNumColumns();
762 for ( i = 0; i < numRows; i++ ) {
763 float sum = mPtr[0] * vPtr[0];
764 for ( j = 1; j < numColumns; j++ ) {
765 sum += mPtr[j] * vPtr[j];
// idSIMD_Generic::MatX_MultiplyAddVecX: adds to each dst element a sum of
// products of matrix entries (read through mPtr) with the entries of vec.
// NOTE(review): this listing is missing lines (local declarations, case labels,
// the per-row advance of mPtr and the accumulation into dst in the generic
// path); presumably mPtr moves forward one matrix row per iteration so that
// dst += mat * vec - confirm against the full file.
776 idSIMD_Generic::MatX_MultiplyAddVecX
779 void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
781 const float *mPtr, *vPtr;
// vec must supply a value for every column and dst must have room for every row.
784 assert( vec.GetSize() >= mat.GetNumColumns() );
785 assert( dst.GetSize() >= mat.GetNumRows() );
787 mPtr = mat.ToFloatPtr();
788 vPtr = vec.ToFloatPtr();
789 dstPtr = dst.ToFloatPtr();
790 numRows = mat.GetNumRows();
// Dispatch on the column count: the loops below write the sum out explicitly
// for one to six terms.
791 switch( mat.GetNumColumns() ) {
793 for ( i = 0; i < numRows; i++ ) {
794 dstPtr[i] += mPtr[0] * vPtr[0];
799 for ( i = 0; i < numRows; i++ ) {
800 dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
805 for ( i = 0; i < numRows; i++ ) {
806 dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
811 for ( i = 0; i < numRows; i++ ) {
812 dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
818 for ( i = 0; i < numRows; i++ ) {
819 dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
820 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
825 for ( i = 0; i < numRows; i++ ) {
826 dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
827 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
// Generic path: accumulate the sum term by term for any other column count.
832 int numColumns = mat.GetNumColumns();
833 for ( i = 0; i < numRows; i++ ) {
834 float sum = mPtr[0] * vPtr[0];
835 for ( j = 1; j < numColumns; j++ ) {
836 sum += mPtr[j] * vPtr[j];
// idSIMD_Generic::MatX_MultiplySubVecX: subtracts from each dst element a sum
// of products of matrix entries (read through mPtr) with the entries of vec.
// NOTE(review): this listing is missing lines (local declarations, case labels,
// the per-row advance of mPtr and the subtraction from dst in the generic
// path); presumably mPtr moves forward one matrix row per iteration so that
// dst -= mat * vec - confirm against the full file.
847 idSIMD_Generic::MatX_MultiplySubVecX
850 void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
852 const float *mPtr, *vPtr;
// vec must supply a value for every column and dst must have room for every row.
855 assert( vec.GetSize() >= mat.GetNumColumns() );
856 assert( dst.GetSize() >= mat.GetNumRows() );
858 mPtr = mat.ToFloatPtr();
859 vPtr = vec.ToFloatPtr();
860 dstPtr = dst.ToFloatPtr();
861 numRows = mat.GetNumRows();
// Dispatch on the column count: the loops below write the sum out explicitly
// for one to six terms.
862 switch( mat.GetNumColumns() ) {
864 for ( i = 0; i < numRows; i++ ) {
865 dstPtr[i] -= mPtr[0] * vPtr[0];
870 for ( i = 0; i < numRows; i++ ) {
871 dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
876 for ( i = 0; i < numRows; i++ ) {
877 dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
882 for ( i = 0; i < numRows; i++ ) {
883 dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
889 for ( i = 0; i < numRows; i++ ) {
890 dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
891 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
896 for ( i = 0; i < numRows; i++ ) {
897 dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
898 mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
// Generic path: accumulate the sum term by term for any other column count.
903 int numColumns = mat.GetNumColumns();
904 for ( i = 0; i < numRows; i++ ) {
905 float sum = mPtr[0] * vPtr[0];
906 for ( j = 1; j < numColumns; j++ ) {
907 sum += mPtr[j] * vPtr[j];
// idSIMD_Generic::MatX_TransposeMultiplyVecX: assigns to each dst element a
// sum of products of matrix entries spaced numColumns floats apart with the
// entries of vec, i.e. one matrix column combined with vec.
// NOTE(review): this listing is missing lines (case labels, the advance of
// mPtr between iterations and the store in the generic path); presumably mPtr
// steps one column per outer iteration so that dst = transpose(mat) * vec -
// confirm against the full file.
918 idSIMD_Generic::MatX_TransposeMultiplyVecX
921 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
922 int i, j, numColumns;
923 const float *mPtr, *vPtr;
// vec must supply a value for every row and dst must have room for every column.
926 assert( vec.GetSize() >= mat.GetNumRows() );
927 assert( dst.GetSize() >= mat.GetNumColumns() );
929 mPtr = mat.ToFloatPtr();
930 vPtr = vec.ToFloatPtr();
931 dstPtr = dst.ToFloatPtr();
932 numColumns = mat.GetNumColumns();
// Dispatch on the row count: the loops below write the sum out explicitly for
// one to six terms, reading down a column with a stride of numColumns.
933 switch( mat.GetNumRows() ) {
935 for ( i = 0; i < numColumns; i++ ) {
936 dstPtr[i] = *(mPtr) * vPtr[0];
941 for ( i = 0; i < numColumns; i++ ) {
942 dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
947 for ( i = 0; i < numColumns; i++ ) {
948 dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
953 for ( i = 0; i < numColumns; i++ ) {
954 dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
955 *(mPtr+3*numColumns) * vPtr[3];
960 for ( i = 0; i < numColumns; i++ ) {
961 dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
962 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
967 for ( i = 0; i < numColumns; i++ ) {
968 dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
969 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
// Generic path for any other row count: restart at the top of column i and
// accumulate one term per row.
974 int numRows = mat.GetNumRows();
975 for ( i = 0; i < numColumns; i++ ) {
976 mPtr = mat.ToFloatPtr() + i;
977 float sum = mPtr[0] * vPtr[0];
978 for ( j = 1; j < numRows; j++ ) {
980 sum += mPtr[0] * vPtr[j];
// idSIMD_Generic::MatX_TransposeMultiplyAddVecX: adds to each dst element a
// sum of products of matrix entries spaced numColumns floats apart with the
// entries of vec, i.e. one matrix column combined with vec.
// NOTE(review): this listing is missing lines (case labels, the advance of
// mPtr between iterations and the accumulation into dst in the generic path);
// presumably mPtr steps one column per outer iteration so that
// dst += transpose(mat) * vec - confirm against the full file.
990 idSIMD_Generic::MatX_TransposeMultiplyAddVecX
993 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
994 int i, j, numColumns;
995 const float *mPtr, *vPtr;
// vec must supply a value for every row and dst must have room for every column.
998 assert( vec.GetSize() >= mat.GetNumRows() );
999 assert( dst.GetSize() >= mat.GetNumColumns() );
1001 mPtr = mat.ToFloatPtr();
1002 vPtr = vec.ToFloatPtr();
1003 dstPtr = dst.ToFloatPtr();
1004 numColumns = mat.GetNumColumns();
// Dispatch on the row count: the loops below write the sum out explicitly for
// one to six terms, reading down a column with a stride of numColumns.
1005 switch( mat.GetNumRows() ) {
1007 for ( i = 0; i < numColumns; i++ ) {
1008 dstPtr[i] += *(mPtr) * vPtr[0];
1013 for ( i = 0; i < numColumns; i++ ) {
1014 dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
1019 for ( i = 0; i < numColumns; i++ ) {
1020 dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
1025 for ( i = 0; i < numColumns; i++ ) {
1026 dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1027 *(mPtr+3*numColumns) * vPtr[3];
1032 for ( i = 0; i < numColumns; i++ ) {
1033 dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1034 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
1039 for ( i = 0; i < numColumns; i++ ) {
1040 dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1041 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
// Generic path for any other row count: restart at the top of column i and
// accumulate one term per row.
1046 int numRows = mat.GetNumRows();
1047 for ( i = 0; i < numColumns; i++ ) {
1048 mPtr = mat.ToFloatPtr() + i;
1049 float sum = mPtr[0] * vPtr[0];
1050 for ( j = 1; j < numRows; j++ ) {
1052 sum += mPtr[0] * vPtr[j];
// idSIMD_Generic::MatX_TransposeMultiplySubVecX: subtracts from each dst
// element a sum of products of matrix entries spaced numColumns floats apart
// with the entries of vec, i.e. one matrix column combined with vec.
// NOTE(review): this listing is missing lines (local declarations, case labels,
// the advance of mPtr between iterations and the subtraction from dst in the
// generic path); presumably mPtr steps one column per outer iteration so that
// dst -= transpose(mat) * vec - confirm against the full file.
1062 idSIMD_Generic::MatX_TransposeMultiplySubVecX
1065 void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
1067 const float *mPtr, *vPtr;
// vec must supply a value for every row and dst must have room for every column.
1070 assert( vec.GetSize() >= mat.GetNumRows() );
1071 assert( dst.GetSize() >= mat.GetNumColumns() );
1073 mPtr = mat.ToFloatPtr();
1074 vPtr = vec.ToFloatPtr();
1075 dstPtr = dst.ToFloatPtr();
1076 numColumns = mat.GetNumColumns();
// Dispatch on the row count: the loops below write the sum out explicitly for
// one to six terms, reading down a column with a stride of numColumns.
1077 switch( mat.GetNumRows() ) {
1079 for ( i = 0; i < numColumns; i++ ) {
1080 dstPtr[i] -= *(mPtr) * vPtr[0];
1085 for ( i = 0; i < numColumns; i++ ) {
1086 dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
1091 for ( i = 0; i < numColumns; i++ ) {
1092 dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
1097 for ( i = 0; i < numColumns; i++ ) {
1098 dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1099 *(mPtr+3*numColumns) * vPtr[3];
1104 for ( i = 0; i < numColumns; i++ ) {
1105 dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1106 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
1111 for ( i = 0; i < numColumns; i++ ) {
1112 dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
1113 *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
// Generic path for any other row count: restart at the top of column i and
// accumulate one term per row.
1118 int numRows = mat.GetNumRows();
1119 for ( i = 0; i < numColumns; i++ ) {
1120 mPtr = mat.ToFloatPtr() + i;
1121 float sum = mPtr[0] * vPtr[0];
1122 for ( int j = 1; j < numRows; j++ ) {
1124 sum += mPtr[0] * vPtr[j];
// idSIMD_Generic::MatX_MultiplyMatX: writes the matrix product m1 * m2 into
// dst through dstPtr. k is the row count of m1, l the column count of m2, and
// the shared inner dimension is m1.GetNumColumns().
// NOTE(review): this listing is missing lines (local declarations, case labels,
// the conditions guarding the hard-coded branches, pointer advances, breaks and
// returns) - confirm every branch description below against the full file.
1134 idSIMD_Generic::MatX_MultiplyMatX
1136 optimizes the following matrix multiplications:
1143 with N in the range [1-6].
1146 void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
1149 const float *m1Ptr, *m2Ptr;
// The inner dimensions must agree for the product to be defined.
1152 assert( m1.GetNumColumns() == m2.GetNumRows() );
1154 dstPtr = dst.ToFloatPtr();
1155 m1Ptr = m1.ToFloatPtr();
1156 m2Ptr = m2.ToFloatPtr();
1157 k = m1.GetNumRows();
1158 l = m2.GetNumColumns();
// Dispatch on the inner dimension. For inner dimensions 1 to 5 there is a
// branch with m2 offsets hard-coded for six columns (steps of 6 between rows),
// followed by a general branch that uses l as the row stride of m2.
1160 switch( m1.GetNumColumns() ) {
1163 for ( i = 0; i < k; i++ ) { // Nx1 * 1x6
1164 *dstPtr++ = m1Ptr[i] * m2Ptr[0];
1165 *dstPtr++ = m1Ptr[i] * m2Ptr[1];
1166 *dstPtr++ = m1Ptr[i] * m2Ptr[2];
1167 *dstPtr++ = m1Ptr[i] * m2Ptr[3];
1168 *dstPtr++ = m1Ptr[i] * m2Ptr[4];
1169 *dstPtr++ = m1Ptr[i] * m2Ptr[5];
1173 for ( i = 0; i < k; i++ ) {
1174 m2Ptr = m2.ToFloatPtr();
1175 for ( j = 0; j < l; j++ ) {
1176 *dstPtr++ = m1Ptr[0] * m2Ptr[0];
1185 for ( i = 0; i < k; i++ ) { // Nx2 * 2x6
1186 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
1187 *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
1188 *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
1189 *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
1190 *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
1191 *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
1196 for ( i = 0; i < k; i++ ) {
1197 m2Ptr = m2.ToFloatPtr();
1198 for ( j = 0; j < l; j++ ) {
1199 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
1208 for ( i = 0; i < k; i++ ) { // Nx3 * 3x6
1209 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
1210 *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
1211 *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
1212 *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
1213 *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
1214 *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
1219 for ( i = 0; i < k; i++ ) {
1220 m2Ptr = m2.ToFloatPtr();
1221 for ( j = 0; j < l; j++ ) {
1222 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
1231 for ( i = 0; i < k; i++ ) { // Nx4 * 4x6
1232 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
1233 *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
1234 *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
1235 *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
1236 *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
1237 *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
1242 for ( i = 0; i < k; i++ ) {
1243 m2Ptr = m2.ToFloatPtr();
1244 for ( j = 0; j < l; j++ ) {
1245 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1246 m1Ptr[3] * m2Ptr[3*l];
1255 for ( i = 0; i < k; i++ ) { // Nx5 * 5x6
1256 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
1257 *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
1258 *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
1259 *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
1260 *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
1261 *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
1266 for ( i = 0; i < k; i++ ) {
1267 m2Ptr = m2.ToFloatPtr();
1268 for ( j = 0; j < l; j++ ) {
1269 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1270 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
// Six-term sums: square results from 1x1 up to 5x5, each selected by a test
// on l; the selection on k is presumably made by labels not visible here.
1280 if ( l == 1 ) { // 1x6 * 6x1
1281 dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
1282 m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
1288 if ( l == 2 ) { // 2x6 * 6x2
1289 for ( i = 0; i < 2; i++ ) {
1290 for ( j = 0; j < 2; j++ ) {
1291 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
1292 + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
1293 + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
1294 + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
1295 + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
1296 + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
1306 if ( l == 3 ) { // 3x6 * 6x3
1307 for ( i = 0; i < 3; i++ ) {
1308 for ( j = 0; j < 3; j++ ) {
1309 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
1310 + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
1311 + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
1312 + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
1313 + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
1314 + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
1324 if ( l == 4 ) { // 4x6 * 6x4
1325 for ( i = 0; i < 4; i++ ) {
1326 for ( j = 0; j < 4; j++ ) {
1327 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
1328 + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
1329 + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
1330 + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
1331 + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
1332 + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
1341 if ( l == 5 ) { // 5x6 * 6x5
1342 for ( i = 0; i < 5; i++ ) {
1343 for ( j = 0; j < 5; j++ ) {
1344 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
1345 + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
1346 + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
1347 + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
1348 + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
1349 + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
// 6x6 left operand: one case per column count of the right operand
// (presumably a nested switch on l - TODO confirm).
1359 case 1: { // 6x6 * 6x1
1360 for ( i = 0; i < 6; i++ ) {
1361 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ]
1362 + m1Ptr[1] * m2Ptr[ 1 * 1 ]
1363 + m1Ptr[2] * m2Ptr[ 2 * 1 ]
1364 + m1Ptr[3] * m2Ptr[ 3 * 1 ]
1365 + m1Ptr[4] * m2Ptr[ 4 * 1 ]
1366 + m1Ptr[5] * m2Ptr[ 5 * 1 ];
1372 case 2: { // 6x6 * 6x2
1373 for ( i = 0; i < 6; i++ ) {
1374 for ( j = 0; j < 2; j++ ) {
1375 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
1376 + m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
1377 + m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
1378 + m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
1379 + m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
1380 + m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
1387 case 3: { // 6x6 * 6x3
1388 for ( i = 0; i < 6; i++ ) {
1389 for ( j = 0; j < 3; j++ ) {
1390 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
1391 + m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
1392 + m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
1393 + m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
1394 + m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
1395 + m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
1402 case 4: { // 6x6 * 6x4
1403 for ( i = 0; i < 6; i++ ) {
1404 for ( j = 0; j < 4; j++ ) {
1405 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
1406 + m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
1407 + m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
1408 + m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
1409 + m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
1410 + m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
1417 case 5: { // 6x6 * 6x5
1418 for ( i = 0; i < 6; i++ ) {
1419 for ( j = 0; j < 5; j++ ) {
1420 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
1421 + m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
1422 + m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
1423 + m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
1424 + m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
1425 + m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
1432 case 6: { // 6x6 * 6x6
1433 for ( i = 0; i < 6; i++ ) {
1434 for ( j = 0; j < 6; j++ ) {
1435 *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ]
1436 + m1Ptr[1] * m2Ptr[ 1 * 6 + j ]
1437 + m1Ptr[2] * m2Ptr[ 2 * 6 + j ]
1438 + m1Ptr[3] * m2Ptr[ 3 * 6 + j ]
1439 + m1Ptr[4] * m2Ptr[ 4 * 6 + j ]
1440 + m1Ptr[5] * m2Ptr[ 5 * 6 + j ];
// Six-term sums for any other k and l, using l as the row stride of m2.
1450 for ( i = 0; i < k; i++ ) {
1451 m2Ptr = m2.ToFloatPtr();
1452 for ( j = 0; j < l; j++ ) {
1453 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
1454 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
// Fully general path: for every output element start at column j of m2 and
// accumulate one term per inner index n; m1Ptr then moves on by one row of m1.
1462 for ( i = 0; i < k; i++ ) {
1463 for ( j = 0; j < l; j++ ) {
1464 m2Ptr = m2.ToFloatPtr() + j;
1465 sum = m1Ptr[0] * m2Ptr[0];
1466 for ( n = 1; n < m1.GetNumColumns(); n++ ) {
1468 sum += m1Ptr[n] * m2Ptr[0];
1472 m1Ptr += m1.GetNumColumns();
1481 idSIMD_Generic::MatX_TransposeMultiplyMatX
1483 optimizes the following transpose matrix multiplications:
1488 with N in the range [1-6].
// Writes the product transpose( m1 ) * m2 into dst, using hand-unrolled code paths
// selected by the number of rows shared by m1 and m2.
// NOTE(review): only part of this function is visible in this listing (local declarations,
// case labels, pointer advances and early exits are not shown), so the comments below
// describe the visible statements only.
1491 void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
1494 const float *m1Ptr, *m2Ptr;
// the transposed product needs both operands to have the same number of rows
1497 assert( m1.GetNumRows() == m2.GetNumRows() );
1499 m1Ptr = m1.ToFloatPtr();
1500 m2Ptr = m2.ToFloatPtr();
1501 dstPtr = dst.ToFloatPtr();
// k and l are the column counts of m1 and m2; they are also used as the row strides
// when stepping down a column (m1Ptr[r*k], m2Ptr[r*l])
1502 k = m1.GetNumColumns();
1503 l = m2.GetNumColumns();
// dispatch on the shared row count
1505 switch( m1.GetNumRows() ) {
// one shared row: every output element is a single product
1507 if ( k == 6 && l == 1 ) { // 1x6 * 1x1
1508 for ( i = 0; i < 6; i++ ) {
1509 *dstPtr++ = m1Ptr[0] * m2Ptr[0];
// same row count, arbitrary k and l
1514 for ( i = 0; i < k; i++ ) {
1515 m2Ptr = m2.ToFloatPtr();
1516 for ( j = 0; j < l; j++ ) {
1517 *dstPtr++ = m1Ptr[0] * m2Ptr[0];
// two shared rows: fully unrolled when m1 has 6 columns and m2 has 2
1524 if ( k == 6 && l == 2 ) { // 2x6 * 2x2
1525 for ( i = 0; i < 6; i++ ) {
1526 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+0] + m1Ptr[1*6] * m2Ptr[1*2+0];
1527 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+1] + m1Ptr[1*6] * m2Ptr[1*2+1];
1532 for ( i = 0; i < k; i++ ) {
1533 m2Ptr = m2.ToFloatPtr();
1534 for ( j = 0; j < l; j++ ) {
1535 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
// three shared rows
1542 if ( k == 6 && l == 3 ) { // 3x6 * 3x3
1543 for ( i = 0; i < 6; i++ ) {
1544 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+0] + m1Ptr[1*6] * m2Ptr[1*3+0] + m1Ptr[2*6] * m2Ptr[2*3+0];
1545 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+1] + m1Ptr[1*6] * m2Ptr[1*3+1] + m1Ptr[2*6] * m2Ptr[2*3+1];
1546 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+2] + m1Ptr[1*6] * m2Ptr[1*3+2] + m1Ptr[2*6] * m2Ptr[2*3+2];
1551 for ( i = 0; i < k; i++ ) {
1552 m2Ptr = m2.ToFloatPtr();
1553 for ( j = 0; j < l; j++ ) {
1554 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
// four shared rows
1561 if ( k == 6 && l == 4 ) { // 4x6 * 4x4
1562 for ( i = 0; i < 6; i++ ) {
1563 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+0] + m1Ptr[1*6] * m2Ptr[1*4+0] + m1Ptr[2*6] * m2Ptr[2*4+0] + m1Ptr[3*6] * m2Ptr[3*4+0];
1564 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+1] + m1Ptr[1*6] * m2Ptr[1*4+1] + m1Ptr[2*6] * m2Ptr[2*4+1] + m1Ptr[3*6] * m2Ptr[3*4+1];
1565 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+2] + m1Ptr[1*6] * m2Ptr[1*4+2] + m1Ptr[2*6] * m2Ptr[2*4+2] + m1Ptr[3*6] * m2Ptr[3*4+2];
1566 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+3] + m1Ptr[1*6] * m2Ptr[1*4+3] + m1Ptr[2*6] * m2Ptr[2*4+3] + m1Ptr[3*6] * m2Ptr[3*4+3];
1571 for ( i = 0; i < k; i++ ) {
1572 m2Ptr = m2.ToFloatPtr();
1573 for ( j = 0; j < l; j++ ) {
1574 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1575 m1Ptr[3*k] * m2Ptr[3*l];
// five shared rows
1582 if ( k == 6 && l == 5 ) { // 5x6 * 5x5
1583 for ( i = 0; i < 6; i++ ) {
1584 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+0] + m1Ptr[1*6] * m2Ptr[1*5+0] + m1Ptr[2*6] * m2Ptr[2*5+0] + m1Ptr[3*6] * m2Ptr[3*5+0] + m1Ptr[4*6] * m2Ptr[4*5+0];
1585 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+1] + m1Ptr[1*6] * m2Ptr[1*5+1] + m1Ptr[2*6] * m2Ptr[2*5+1] + m1Ptr[3*6] * m2Ptr[3*5+1] + m1Ptr[4*6] * m2Ptr[4*5+1];
1586 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+2] + m1Ptr[1*6] * m2Ptr[1*5+2] + m1Ptr[2*6] * m2Ptr[2*5+2] + m1Ptr[3*6] * m2Ptr[3*5+2] + m1Ptr[4*6] * m2Ptr[4*5+2];
1587 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+3] + m1Ptr[1*6] * m2Ptr[1*5+3] + m1Ptr[2*6] * m2Ptr[2*5+3] + m1Ptr[3*6] * m2Ptr[3*5+3] + m1Ptr[4*6] * m2Ptr[4*5+3];
1588 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+4] + m1Ptr[1*6] * m2Ptr[1*5+4] + m1Ptr[2*6] * m2Ptr[2*5+4] + m1Ptr[3*6] * m2Ptr[3*5+4] + m1Ptr[4*6] * m2Ptr[4*5+4];
1593 for ( i = 0; i < k; i++ ) {
1594 m2Ptr = m2.ToFloatPtr();
1595 for ( j = 0; j < l; j++ ) {
1596 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1597 m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
// six shared rows with a 6-column m2: the cases below differ only in the m1 stride (1..6),
// i.e. the number of columns of m1
// NOTE(review): the enclosing condition/switch is not visible here; presumably these are
// selected on k when l == 6 -- confirm against the full source
1606 case 1: // 6x1 * 6x6
1607 m2Ptr = m2.ToFloatPtr();
1608 for ( j = 0; j < 6; j++ ) {
1609 *dstPtr++ = m1Ptr[0*1] * m2Ptr[0*6] +
1610 m1Ptr[1*1] * m2Ptr[1*6] +
1611 m1Ptr[2*1] * m2Ptr[2*6] +
1612 m1Ptr[3*1] * m2Ptr[3*6] +
1613 m1Ptr[4*1] * m2Ptr[4*6] +
1614 m1Ptr[5*1] * m2Ptr[5*6];
1618 case 2: // 6x2 * 6x6
1619 for ( i = 0; i < 2; i++ ) {
1620 m2Ptr = m2.ToFloatPtr();
1621 for ( j = 0; j < 6; j++ ) {
1622 *dstPtr++ = m1Ptr[0*2] * m2Ptr[0*6] +
1623 m1Ptr[1*2] * m2Ptr[1*6] +
1624 m1Ptr[2*2] * m2Ptr[2*6] +
1625 m1Ptr[3*2] * m2Ptr[3*6] +
1626 m1Ptr[4*2] * m2Ptr[4*6] +
1627 m1Ptr[5*2] * m2Ptr[5*6];
1633 case 3: // 6x3 * 6x6
1634 for ( i = 0; i < 3; i++ ) {
1635 m2Ptr = m2.ToFloatPtr();
1636 for ( j = 0; j < 6; j++ ) {
1637 *dstPtr++ = m1Ptr[0*3] * m2Ptr[0*6] +
1638 m1Ptr[1*3] * m2Ptr[1*6] +
1639 m1Ptr[2*3] * m2Ptr[2*6] +
1640 m1Ptr[3*3] * m2Ptr[3*6] +
1641 m1Ptr[4*3] * m2Ptr[4*6] +
1642 m1Ptr[5*3] * m2Ptr[5*6];
1648 case 4: // 6x4 * 6x6
1649 for ( i = 0; i < 4; i++ ) {
1650 m2Ptr = m2.ToFloatPtr();
1651 for ( j = 0; j < 6; j++ ) {
1652 *dstPtr++ = m1Ptr[0*4] * m2Ptr[0*6] +
1653 m1Ptr[1*4] * m2Ptr[1*6] +
1654 m1Ptr[2*4] * m2Ptr[2*6] +
1655 m1Ptr[3*4] * m2Ptr[3*6] +
1656 m1Ptr[4*4] * m2Ptr[4*6] +
1657 m1Ptr[5*4] * m2Ptr[5*6];
1663 case 5: // 6x5 * 6x6
1664 for ( i = 0; i < 5; i++ ) {
1665 m2Ptr = m2.ToFloatPtr();
1666 for ( j = 0; j < 6; j++ ) {
1667 *dstPtr++ = m1Ptr[0*5] * m2Ptr[0*6] +
1668 m1Ptr[1*5] * m2Ptr[1*6] +
1669 m1Ptr[2*5] * m2Ptr[2*6] +
1670 m1Ptr[3*5] * m2Ptr[3*6] +
1671 m1Ptr[4*5] * m2Ptr[4*6] +
1672 m1Ptr[5*5] * m2Ptr[5*6];
1678 case 6: // 6x6 * 6x6
1679 for ( i = 0; i < 6; i++ ) {
1680 m2Ptr = m2.ToFloatPtr();
1681 for ( j = 0; j < 6; j++ ) {
1682 *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*6] +
1683 m1Ptr[1*6] * m2Ptr[1*6] +
1684 m1Ptr[2*6] * m2Ptr[2*6] +
1685 m1Ptr[3*6] * m2Ptr[3*6] +
1686 m1Ptr[4*6] * m2Ptr[4*6] +
1687 m1Ptr[5*6] * m2Ptr[5*6];
// six shared rows, arbitrary k and l
1695 for ( i = 0; i < k; i++ ) {
1696 m2Ptr = m2.ToFloatPtr();
1697 for ( j = 0; j < l; j++ ) {
1698 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
1699 m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
// general path: for output element ( i, j ) start at column i of m1 and column j of m2
// and accumulate one product per shared row
1706 for ( i = 0; i < k; i++ ) {
1707 for ( j = 0; j < l; j++ ) {
1708 m1Ptr = m1.ToFloatPtr() + i;
1709 m2Ptr = m2.ToFloatPtr() + j;
1710 sum = m1Ptr[0] * m2Ptr[0];
1711 for ( n = 1; n < m1.GetNumRows(); n++ ) {
// NOTE(review): the pointer advances to the next row are not visible in this listing
1714 sum += m1Ptr[0] * m2Ptr[0];
1725 idSIMD_Generic::MatX_LowerTriangularSolve
1727 solves x in Lx = b for the n * n sub-matrix of L
1728 if skip > 0 the first skip elements of x are assumed to be valid already
1729 L has to be a lower triangular matrix with (implicit) ones on the diagonal
// Forward substitution: fills x[skip..n-1] so that L * x = b for the n * n sub-matrix of L.
// No division is performed because the diagonal of L is treated as ones.
// NOTE(review): several lines of this function (declarations, breaks/returns, per-row pointer
// updates and the final stores into x) are not visible in this listing.
1733 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
1743 lptr = L.ToFloatPtr();
// nc is the column count of L, used as the row stride in lptr[row*nc+col]
1744 nc = L.GetNumColumns();
1746 // unrolled cases for n < 8
// NSKIP packs n and the low three bits of skip into one switch key
1748 #define NSKIP( n, s ) ((n<<3)|(s&7))
1749 switch( NSKIP( n, skip ) ) {
// for a given n the labels are ordered by increasing skip, so entering at ( n, skip )
// starts the computation at row 'skip'
// NOTE(review): presumably each group falls through to its last row and then exits -- the
// terminating statements are not visible here
1750 case NSKIP( 1, 0 ): x[0] = b[0];
1752 case NSKIP( 2, 0 ): x[0] = b[0];
1753 case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1755 case NSKIP( 3, 0 ): x[0] = b[0];
1756 case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1757 case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1759 case NSKIP( 4, 0 ): x[0] = b[0];
1760 case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1761 case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1762 case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1764 case NSKIP( 5, 0 ): x[0] = b[0];
1765 case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1766 case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1767 case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1768 case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1770 case NSKIP( 6, 0 ): x[0] = b[0];
1771 case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1772 case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1773 case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1774 case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1775 case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
1777 case NSKIP( 7, 0 ): x[0] = b[0];
1778 case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
1779 case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1780 case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
1781 case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
1782 case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
1783 case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
1789 // process first 4 rows
// NOTE(review): the switch header for these labels is not visible; presumably it switches
// on skip so that already-valid rows are not recomputed
1791 case 0: x[0] = b[0];
1792 case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
1793 case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
1794 case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
// four independent double-precision partial sums for the row dot product
// NOTE(review): the 'register' storage class was removed in C++17 -- drop it if this file
// is ever built with a newer language standard
1801 register double s0, s1, s2, s3;
1803 for ( i = skip; i < n; i++ ) {
// the first four columns seed the partial sums
1804 s0 = lptr[0] * x[0];
1805 s1 = lptr[1] * x[1];
1806 s2 = lptr[2] * x[2];
1807 s3 = lptr[3] * x[3];
// remaining columns below the diagonal, eight at a time
1808 for ( j = 4; j < i-7; j += 8 ) {
1809 s0 += lptr[j+0] * x[j+0];
1810 s1 += lptr[j+1] * x[j+1];
1811 s2 += lptr[j+2] * x[j+2];
1812 s3 += lptr[j+3] * x[j+3];
1813 s0 += lptr[j+4] * x[j+4];
1814 s1 += lptr[j+5] * x[j+5];
1815 s2 += lptr[j+6] * x[j+6];
1816 s3 += lptr[j+7] * x[j+7];
// left-over columns (at most seven); the labels are listed from highest to lowest count
// NOTE(review): the switch expression is not visible in this listing
1820 case 7: s0 += lptr[j+6] * x[j+6];
1821 case 6: s1 += lptr[j+5] * x[j+5];
1822 case 5: s2 += lptr[j+4] * x[j+4];
1823 case 4: s3 += lptr[j+3] * x[j+3];
1824 case 3: s0 += lptr[j+2] * x[j+2];
1825 case 2: s1 += lptr[j+1] * x[j+1];
1826 case 1: s2 += lptr[j+0] * x[j+0];
// straightforward (non-unrolled) form of the same substitution
// NOTE(review): presumably the alternative branch of a preprocessor conditional -- confirm
1845 for ( i = skip; i < n; i++ ) {
1848 for ( j = 0; j < i; j++ ) {
1849 sum -= lptr[j] * x[j];
1859 idSIMD_Generic::MatX_LowerTriangularSolveTranspose
1861 solves x in L'x = b for the n * n sub-matrix of L
1862 L has to be a lower triangular matrix with (implicit) ones on the diagonal
// Back substitution: fills x so that transpose( L ) * x = b for the n * n sub-matrix of L.
// Elements of the transpose are read column-wise from L ( lptr[row*nc+col] with row > col ),
// and no division is performed because the diagonal is treated as ones.
// NOTE(review): several lines of this function (declarations, case labels, the stores of the
// last element of each unrolled case, pointer updates) are not visible in this listing.
1866 void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
1872 lptr = L.ToFloatPtr();
// nc is the column count of L, used as the row stride
1873 nc = L.GetNumColumns();
1875 // unrolled cases for n < 8
// each group below solves from the highest index down to x[0]; every x[r] subtracts the
// already-solved x[c] ( c > r ) weighted by L[c][r]
1885 x[0] = b[0] - lptr[1*nc+0] * x[1];
1889 x[1] = b[1] - lptr[2*nc+1] * x[2];
1890 x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1894 x[2] = b[2] - lptr[3*nc+2] * x[3];
1895 x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1896 x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1900 x[3] = b[3] - lptr[4*nc+3] * x[4];
1901 x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1902 x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1903 x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1907 x[4] = b[4] - lptr[5*nc+4] * x[5];
1908 x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
1909 x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1910 x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1911 x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
1915 x[5] = b[5] - lptr[6*nc+5] * x[6];
1916 x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
1917 x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
1918 x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
1919 x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
1920 x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
// four double-precision accumulators, one per row of the current 4-row group
// NOTE(review): the 'register' storage class was removed in C++17 -- drop it if this file
// is ever built with a newer language standard
1927 register double s0, s1, s2, s3;
// start four columns before the element at row n, column n of L and work towards the front
1930 lptr = L.ToFloatPtr() + n * nc + n - 4;
1933 // process 4 rows at a time
1934 for ( i = n; i >= 4; i -= 4 ) {
1939 // process 4x4 blocks
// subtract the contribution of the rows that are already solved, one 4x4 block per step
1940 for ( j = 0; j < n-i; j += 4 ) {
1941 s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
1942 s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
1943 s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
1944 s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
1945 s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
1946 s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
1947 s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
1948 s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
1949 s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
1950 s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
1951 s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
1952 s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
1953 s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
1954 s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
1955 s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
1956 s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
1958 // process left over of the 4 rows
// resolve the dependencies inside the group itself: s3 feeds s2, s1 and s0, then s2 feeds
// s1 and s0, then s1 feeds s0
1959 s0 -= lptr[0-1*nc] * s3;
1960 s1 -= lptr[1-1*nc] * s3;
1961 s2 -= lptr[2-1*nc] * s3;
1962 s0 -= lptr[0-2*nc] * s2;
1963 s1 -= lptr[1-2*nc] * s2;
1964 s0 -= lptr[0-3*nc] * s1;
1970 // update pointers for next four rows
1974 // process left over rows
// fewer than four rows remain; solve them one at a time
1975 for ( i--; i >= 0; i-- ) {
1978 for ( j = i + 1; j < n; j++ ) {
1979 s0 -= lptr[j*nc] * x[j];
// straightforward (non-unrolled) form of the same back substitution
// NOTE(review): presumably the alternative branch of a preprocessor conditional -- confirm
1990 nc = L.GetNumColumns();
1991 for ( i = n - 1; i >= 0; i-- ) {
1994 for ( j = i + 1; j < n; j++ ) {
1995 sum -= ptr[j*nc] * x[j];
2005 idSIMD_Generic::MatX_LDLTFactor
2007 in-place factorization LDL' of the n * n sub-matrix of mat
2008 the reciprocal of the diagonal elements are stored in invDiag
// Factors the n * n sub-matrix of mat in place and stores the reciprocal of each diagonal
// element of the factorization in invDiag.
// NOTE(review): several lines of this function (declarations, the statements executed when a
// zero pivot is found, pointer updates, the return statements) are not visible in this
// listing; presumably a zero pivot makes the function return false -- confirm.
2011 bool VPCALL idSIMD_Generic::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
2015 float *v, *diag, *mptr;
// partial sums are kept in double precision
2016 double s0, s1, s2, s3, sum, d;
// stack scratch space: v holds the current row scaled by the diagonal, diag the diagonal
2018 v = (float *) _alloca16( n * sizeof( float ) );
2019 diag = (float *) _alloca16( n * sizeof( float ) );
// nc is the column count of mat, used as the row stride
2021 nc = mat.GetNumColumns();
// --- row 0 ---
2031 if ( sum == 0.0f ) {
2036 invDiag[0] = d = 1.0f / sum;
// scale column 0 of all rows below by the reciprocal pivot
2043 for ( j = 1; j < n; j++ ) {
2044 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
// --- row 1 ---
2049 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2052 if ( sum == 0.0f ) {
2058 invDiag[1] = d = 1.0f / sum;
2065 for ( j = 2; j < n; j++ ) {
2066 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
// --- row 2 ---
2071 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2072 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2073 sum = mptr[2] - s0 - s1;
2075 if ( sum == 0.0f ) {
2081 invDiag[2] = d = 1.0f / sum;
2088 for ( j = 3; j < n; j++ ) {
2089 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
// --- row 3 ---
2094 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2095 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2096 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
2097 sum = mptr[3] - s0 - s1 - s2;
2099 if ( sum == 0.0f ) {
2105 invDiag[3] = d = 1.0f / sum;
2112 for ( j = 4; j < n; j++ ) {
2113 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
// --- rows 4 .. n-1 ---
2116 for ( i = 4; i < n; i++ ) {
// build v[k] = diag[k] * row[k] and accumulate v[k] * row[k] in four partial sums
2120 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
2121 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
2122 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
2123 v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
2124 for ( k = 4; k < i-3; k += 4 ) {
2125 v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
2126 v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
2127 v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
2128 v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
// left-over columns (at most three)
// NOTE(review): the switch expression is not visible in this listing
2132 case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
2133 case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
2134 case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
// pivot of row i: the stored diagonal element minus the accumulated sum
2141 sum = mptr[i] - sum;
2143 if ( sum == 0.0f ) {
2149 invDiag[i] = d = 1.0f / sum;
// update column i of every row below: dot product of that row with v, eight columns
// per iteration after the first four
2156 for ( j = i+1; j < n; j++ ) {
2157 s0 = mptr[0] * v[0];
2158 s1 = mptr[1] * v[1];
2159 s2 = mptr[2] * v[2];
2160 s3 = mptr[3] * v[3];
2161 for ( k = 4; k < i-7; k += 8 ) {
2162 s0 += mptr[k+0] * v[k+0];
2163 s1 += mptr[k+1] * v[k+1];
2164 s2 += mptr[k+2] * v[k+2];
2165 s3 += mptr[k+3] * v[k+3];
2166 s0 += mptr[k+4] * v[k+4];
2167 s1 += mptr[k+5] * v[k+5];
2168 s2 += mptr[k+6] * v[k+6];
2169 s3 += mptr[k+7] * v[k+7];
// left-over columns (at most seven)
2173 case 7: s0 += mptr[k+6] * v[k+6];
2174 case 6: s1 += mptr[k+5] * v[k+5];
2175 case 5: s2 += mptr[k+4] * v[k+4];
2176 case 4: s3 += mptr[k+3] * v[k+3];
2177 case 3: s0 += mptr[k+2] * v[k+2];
2178 case 2: s1 += mptr[k+1] * v[k+1];
2179 case 1: s2 += mptr[k+0] * v[k+0];
2186 mptr[i] = ( mptr[i] - sum ) * d;
// straightforward (non-unrolled) form of the same factorization
// NOTE(review): presumably the alternative branch of a preprocessor conditional -- confirm
2196 float *v, *ptr, *diagPtr;
2199 v = (float *) _alloca16( n * sizeof( float ) );
2200 nc = mat.GetNumColumns();
2202 for ( i = 0; i < n; i++ ) {
2207 for ( j = 0; j < i; j++ ) {
2209 v[j] = diagPtr[0] * d;
2214 if ( sum == 0.0f ) {
2219 invDiag[i] = d = 1.0f / sum;
2226 for ( j = i + 1; j < n; j++ ) {
2228 for ( k = 0; k < i; k++ ) {
2229 sum -= ptr[k] * v[k];
2243 idSIMD_Generic::BlendJoints
2246 void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
2249 for ( i = 0; i < numJoints; i++ ) {
2251 joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
2252 joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
2258 idSIMD_Generic::ConvertJointQuatsToJointMats
2261 void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
2264 for ( i = 0; i < numJoints; i++ ) {
2265 jointMats[i].SetRotation( jointQuats[i].q.ToMat3() );
2266 jointMats[i].SetTranslation( jointQuats[i].t );
2272 idSIMD_Generic::ConvertJointMatsToJointQuats
2275 void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
2278 for ( i = 0; i < numJoints; i++ ) {
2279 jointQuats[i] = jointMats[i].ToJointQuat();
2285 idSIMD_Generic::TransformJoints
2288 void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
2291 for( i = firstJoint; i <= lastJoint; i++ ) {
2292 assert( parents[i] < i );
2293 jointMats[i] *= jointMats[parents[i]];
2299 idSIMD_Generic::UntransformJoints
2302 void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
2305 for( i = lastJoint; i >= firstJoint; i-- ) {
2306 assert( parents[i] < i );
2307 jointMats[i] /= jointMats[parents[i]];
2313 idSIMD_Generic::TransformVerts
2316 void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
2318 const byte *jointsPtr = (byte *)joints;
2320 for( j = i = 0; i < numVerts; i++ ) {
2323 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
2324 while( index[j*2+1] == 0 ) {
2326 v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
2336 idSIMD_Generic::TracePointCull
2339 void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2345 for ( i = 0; i < numVerts; i++ ) {
2347 float d0, d1, d2, d3, t;
2348 const idVec3 &v = verts[i].xyz;
2350 d0 = planes[0].Distance( v );
2351 d1 = planes[1].Distance( v );
2352 d2 = planes[2].Distance( v );
2353 d3 = planes[3].Distance( v );
2356 bits = FLOATSIGNBITSET( t ) << 0;
2358 bits |= FLOATSIGNBITSET( t ) << 1;
2360 bits |= FLOATSIGNBITSET( t ) << 2;
2362 bits |= FLOATSIGNBITSET( t ) << 3;
2365 bits |= FLOATSIGNBITSET( t ) << 4;
2367 bits |= FLOATSIGNBITSET( t ) << 5;
2369 bits |= FLOATSIGNBITSET( t ) << 6;
2371 bits |= FLOATSIGNBITSET( t ) << 7;
2373 bits ^= 0x0F; // flip lower four bits
2384 idSIMD_Generic::DecalPointCull
2387 void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2390 for ( i = 0; i < numVerts; i++ ) {
2392 float d0, d1, d2, d3, d4, d5;
2393 const idVec3 &v = verts[i].xyz;
2395 d0 = planes[0].Distance( v );
2396 d1 = planes[1].Distance( v );
2397 d2 = planes[2].Distance( v );
2398 d3 = planes[3].Distance( v );
2399 d4 = planes[4].Distance( v );
2400 d5 = planes[5].Distance( v );
2402 bits = FLOATSIGNBITSET( d0 ) << 0;
2403 bits |= FLOATSIGNBITSET( d1 ) << 1;
2404 bits |= FLOATSIGNBITSET( d2 ) << 2;
2405 bits |= FLOATSIGNBITSET( d3 ) << 3;
2406 bits |= FLOATSIGNBITSET( d4 ) << 4;
2407 bits |= FLOATSIGNBITSET( d5 ) << 5;
2409 cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
2415 idSIMD_Generic::OverlayPointCull
2418 void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
2421 for ( i = 0; i < numVerts; i++ ) {
2424 const idVec3 &v = verts[i].xyz;
2426 texCoords[i][0] = d0 = planes[0].Distance( v );
2427 texCoords[i][1] = d1 = planes[1].Distance( v );
2429 bits = FLOATSIGNBITSET( d0 ) << 0;
2431 bits |= FLOATSIGNBITSET( d1 ) << 1;
2433 bits |= FLOATSIGNBITSET( d0 ) << 2;
2434 bits |= FLOATSIGNBITSET( d1 ) << 3;
2442 idSIMD_Generic::DeriveTriPlanes
2444 Derives a plane equation for each triangle.
2447 void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2450 for ( i = 0; i < numIndexes; i += 3 ) {
2451 const idDrawVert *a, *b, *c;
2452 float d0[3], d1[3], f;
2455 a = verts + indexes[i + 0];
2456 b = verts + indexes[i + 1];
2457 c = verts + indexes[i + 2];
2459 d0[0] = b->xyz[0] - a->xyz[0];
2460 d0[1] = b->xyz[1] - a->xyz[1];
2461 d0[2] = b->xyz[2] - a->xyz[2];
2463 d1[0] = c->xyz[0] - a->xyz[0];
2464 d1[1] = c->xyz[1] - a->xyz[1];
2465 d1[2] = c->xyz[2] - a->xyz[2];
2467 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
2468 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
2469 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
2471 f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
2477 planes->SetNormal( n );
2478 planes->FitThroughPoint( a->xyz );
2485 idSIMD_Generic::DeriveTangents
2487 Derives the normal and orthogonal tangent vectors for the triangle vertices.
2488 For each vertex the normal and tangent vectors are derived from all triangles
2489 using the vertex which results in smooth tangents across the mesh.
2490 In the process the triangle planes are calculated as well.
2493 void VPCALL idSIMD_Generic::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2496 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2497 memset( used, 0, numVerts * sizeof( used[0] ) );
2499 idPlane *planesPtr = planes;
2500 for ( i = 0; i < numIndexes; i += 3 ) {
2501 idDrawVert *a, *b, *c;
2502 unsigned long signBit;
2503 float d0[5], d1[5], f, area;
2506 int v0 = indexes[i + 0];
2507 int v1 = indexes[i + 1];
2508 int v2 = indexes[i + 2];
2514 d0[0] = b->xyz[0] - a->xyz[0];
2515 d0[1] = b->xyz[1] - a->xyz[1];
2516 d0[2] = b->xyz[2] - a->xyz[2];
2517 d0[3] = b->st[0] - a->st[0];
2518 d0[4] = b->st[1] - a->st[1];
2520 d1[0] = c->xyz[0] - a->xyz[0];
2521 d1[1] = c->xyz[1] - a->xyz[1];
2522 d1[2] = c->xyz[2] - a->xyz[2];
2523 d1[3] = c->st[0] - a->st[0];
2524 d1[4] = c->st[1] - a->st[1];
2527 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
2528 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
2529 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
2531 f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
2537 planesPtr->SetNormal( n );
2538 planesPtr->FitThroughPoint( a->xyz );
2542 area = d0[3] * d1[4] - d0[4] * d1[3];
2543 signBit = ( *(unsigned long *)&area ) & ( 1 << 31 );
2546 t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
2547 t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
2548 t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
2550 f = idMath::RSqrt( t0.x * t0.x + t0.y * t0.y + t0.z * t0.z );
2551 *(unsigned long *)&f ^= signBit;
2558 t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
2559 t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
2560 t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
2562 f = idMath::RSqrt( t1.x * t1.x + t1.y * t1.y + t1.z * t1.z );
2563 *(unsigned long *)&f ^= signBit;
2571 a->tangents[0] += t0;
2572 a->tangents[1] += t1;
2575 a->tangents[0] = t0;
2576 a->tangents[1] = t1;
2582 b->tangents[0] += t0;
2583 b->tangents[1] += t1;
2586 b->tangents[0] = t0;
2587 b->tangents[1] = t1;
2593 c->tangents[0] += t0;
2594 c->tangents[1] += t1;
2597 c->tangents[0] = t0;
2598 c->tangents[1] = t1;
2606 idSIMD_Generic::DeriveUnsmoothedTangents
2608 Derives the normal and orthogonal tangent vectors for the triangle vertices.
2609 For each vertex the normal and tangent vectors are derived from a single dominant triangle.
// When this macro is defined, the second tangent below is built from the normal and the first
// tangent ( the three-line group using n0..n2 and t0..t2 ) instead of from the texture
// coordinate deltas.
2612 #define DERIVE_UNSMOOTHED_BITANGENT
// Writes the tangent vectors of every vertex from the two edges of its dominant triangle,
// scaled by the normalization factors stored with that triangle.
// NOTE(review): several lines of this function (some declarations, the setup of the vertex
// pointers a, b and c, the #else / #endif of the conditional, and any statements between the
// visible ones) are not visible in this listing.
2614 void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
2617 for ( i = 0; i < numVerts; i++ ) {
2618 idDrawVert *a, *b, *c;
2619 float d0, d1, d2, d3, d4;
2620 float d5, d6, d7, d8, d9;
2626 const dominantTri_s &dt = dominantTris[i];
// d0-d4: position ( xyz ) and texture coordinate ( st ) deltas from a to b
2632 d0 = b->xyz[0] - a->xyz[0];
2633 d1 = b->xyz[1] - a->xyz[1];
2634 d2 = b->xyz[2] - a->xyz[2];
2635 d3 = b->st[0] - a->st[0];
2636 d4 = b->st[1] - a->st[1];
// d5-d9: the same deltas from a to c
2638 d5 = c->xyz[0] - a->xyz[0];
2639 d6 = c->xyz[1] - a->xyz[1];
2640 d7 = c->xyz[2] - a->xyz[2];
2641 d8 = c->st[0] - a->st[0];
2642 d9 = c->st[1] - a->st[1];
// scale factors stored with the dominant triangle: s0 is applied to the first tangent,
// s1 to the second tangent and s2 to the normal
2644 s0 = dt.normalizationScale[0];
2645 s1 = dt.normalizationScale[1];
2646 s2 = dt.normalizationScale[2];
// normal: scaled cross product of the two position deltas
2648 n0 = s2 * ( d6 * d2 - d7 * d1 );
2649 n1 = s2 * ( d7 * d0 - d5 * d2 );
2650 n2 = s2 * ( d5 * d1 - d6 * d0 );
// first tangent
2652 t0 = s0 * ( d0 * d9 - d4 * d5 );
2653 t1 = s0 * ( d1 * d9 - d4 * d6 );
2654 t2 = s0 * ( d2 * d9 - d4 * d7 );
// second tangent: from the texture coordinate deltas when the macro is not defined ...
2656 #ifndef DERIVE_UNSMOOTHED_BITANGENT
2657 t3 = s1 * ( d3 * d5 - d0 * d8 );
2658 t4 = s1 * ( d3 * d6 - d1 * d8 );
2659 t5 = s1 * ( d3 * d7 - d2 * d8 );
// ... otherwise as the scaled cross product of the normal and the first tangent
2661 t3 = s1 * ( n2 * t1 - n1 * t2 );
2662 t4 = s1 * ( n0 * t2 - n2 * t0 );
2663 t5 = s1 * ( n1 * t0 - n0 * t1 );
// store both tangents on vertex a
2670 a->tangents[0][0] = t0;
2671 a->tangents[0][1] = t1;
2672 a->tangents[0][2] = t2;
2674 a->tangents[1][0] = t3;
2675 a->tangents[1][1] = t4;
2676 a->tangents[1][2] = t5;
2682 idSIMD_Generic::NormalizeTangents
2684 Normalizes each vertex normal and projects and normalizes the
2685 tangent vectors onto the plane orthogonal to the vertex normal.
2688 void VPCALL idSIMD_Generic::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
2690 for ( int i = 0; i < numVerts; i++ ) {
2691 idVec3 &v = verts[i].normal;
2694 f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
2695 v.x *= f; v.y *= f; v.z *= f;
2697 for ( int j = 0; j < 2; j++ ) {
2698 idVec3 &t = verts[i].tangents[j];
2701 f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
2702 t.x *= f; t.y *= f; t.z *= f;
2709 idSIMD_Generic::CreateTextureSpaceLightVectors
2711 Calculates light vectors in texture space for the given triangle vertices.
2712 For each vertex the direction towards the light origin is projected onto texture space.
2713 The light vectors are only calculated for the vertices referenced by the indexes.
2716 void VPCALL idSIMD_Generic::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2718 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2719 memset( used, 0, numVerts * sizeof( used[0] ) );
2721 for ( int i = numIndexes - 1; i >= 0; i-- ) {
2722 used[indexes[i]] = true;
2725 for ( int i = 0; i < numVerts; i++ ) {
2730 const idDrawVert *v = &verts[i];
2732 idVec3 lightDir = lightOrigin - v->xyz;
2734 lightVectors[i][0] = lightDir * v->tangents[0];
2735 lightVectors[i][1] = lightDir * v->tangents[1];
2736 lightVectors[i][2] = lightDir * v->normal;
2742 idSIMD_Generic::CreateSpecularTextureCoords
2744 Calculates specular texture coordinates for the given triangle vertices.
2745 For each vertex the normalized direction towards the light origin is added to the
2746 normalized direction towards the view origin and the result is projected onto texture space.
2747 The texture coordinates are only calculated for the vertices referenced by the indexes.
2750 void VPCALL idSIMD_Generic::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
2752 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
2753 memset( used, 0, numVerts * sizeof( used[0] ) );
2755 for ( int i = numIndexes - 1; i >= 0; i-- ) {
2756 used[indexes[i]] = true;
2759 for ( int i = 0; i < numVerts; i++ ) {
2764 const idDrawVert *v = &verts[i];
2766 idVec3 lightDir = lightOrigin - v->xyz;
2767 idVec3 viewDir = viewOrigin - v->xyz;
2771 ilength = idMath::RSqrt( lightDir * lightDir );
2772 lightDir[0] *= ilength;
2773 lightDir[1] *= ilength;
2774 lightDir[2] *= ilength;
2776 ilength = idMath::RSqrt( viewDir * viewDir );
2777 viewDir[0] *= ilength;
2778 viewDir[1] *= ilength;
2779 viewDir[2] *= ilength;
2781 lightDir += viewDir;
2783 texCoords[i][0] = lightDir * v->tangents[0];
2784 texCoords[i][1] = lightDir * v->tangents[1];
2785 texCoords[i][2] = lightDir * v->normal;
2786 texCoords[i][3] = 1.0f;
2792 idSIMD_Generic::CreateShadowCache
2795 int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
2798 for ( int i = 0; i < numVerts; i++ ) {
2799 if ( vertRemap[i] ) {
2802 const float *v = verts[i].xyz.ToFloatPtr();
2803 vertexCache[outVerts+0][0] = v[0];
2804 vertexCache[outVerts+0][1] = v[1];
2805 vertexCache[outVerts+0][2] = v[2];
2806 vertexCache[outVerts+0][3] = 1.0f;
2808 // R_SetupProjection() builds the projection matrix with a slight crunch
2809 // for depth, which keeps this w=0 division from rasterizing right at the
2810 // wrap around point and causing depth fighting with the rear caps
2811 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
2812 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
2813 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
2814 vertexCache[outVerts+1][3] = 0.0f;
2815 vertRemap[i] = outVerts;
2823 idSIMD_Generic::CreateVertexProgramShadowCache
2826 int VPCALL idSIMD_Generic::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
2827 for ( int i = 0; i < numVerts; i++ ) {
2828 const float *v = verts[i].xyz.ToFloatPtr();
2829 vertexCache[i*2+0][0] = v[0];
2830 vertexCache[i*2+1][0] = v[0];
2831 vertexCache[i*2+0][1] = v[1];
2832 vertexCache[i*2+1][1] = v[1];
2833 vertexCache[i*2+0][2] = v[2];
2834 vertexCache[i*2+1][2] = v[2];
2835 vertexCache[i*2+0][3] = 1.0f;
2836 vertexCache[i*2+1][3] = 0.0f;
2838 return numVerts * 2;
2843 idSIMD_Generic::UpSamplePCMTo44kHz
2845 Duplicate samples for 44kHz output.
2848 void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
2849 if ( kHz == 11025 ) {
2850 if ( numChannels == 1 ) {
2851 for ( int i = 0; i < numSamples; i++ ) {
2852 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
2855 for ( int i = 0; i < numSamples; i += 2 ) {
2856 dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
2857 dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
2860 } else if ( kHz == 22050 ) {
2861 if ( numChannels == 1 ) {
2862 for ( int i = 0; i < numSamples; i++ ) {
2863 dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
2866 for ( int i = 0; i < numSamples; i += 2 ) {
2867 dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
2868 dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
2871 } else if ( kHz == 44100 ) {
2872 for ( int i = 0; i < numSamples; i++ ) {
2873 dest[i] = (float) src[i];
2882 idSIMD_Generic::UpSampleOGGTo44kHz
2884 Duplicate samples for 44kHz output.
2887 void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
2888 if ( kHz == 11025 ) {
2889 if ( numChannels == 1 ) {
2890 for ( int i = 0; i < numSamples; i++ ) {
2891 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
2894 for ( int i = 0; i < numSamples >> 1; i++ ) {
2895 dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
2896 dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
2899 } else if ( kHz == 22050 ) {
2900 if ( numChannels == 1 ) {
2901 for ( int i = 0; i < numSamples; i++ ) {
2902 dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
2905 for ( int i = 0; i < numSamples >> 1; i++ ) {
2906 dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
2907 dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
2910 } else if ( kHz == 44100 ) {
2911 if ( numChannels == 1 ) {
2912 for ( int i = 0; i < numSamples; i++ ) {
2913 dest[i*1+0] = ogg[0][i] * 32768.0f;
2916 for ( int i = 0; i < numSamples >> 1; i++ ) {
2917 dest[i*2+0] = ogg[0][i] * 32768.0f;
2918 dest[i*2+1] = ogg[1][i] * 32768.0f;
2928 idSIMD_Generic::MixSoundTwoSpeakerMono
2931 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
2932 float sL = lastV[0];
2933 float sR = lastV[1];
2934 float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2935 float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2937 assert( numSamples == MIXBUFFER_SAMPLES );
2939 for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
2940 mixBuffer[j*2+0] += samples[j] * sL;
2941 mixBuffer[j*2+1] += samples[j] * sR;
2949 idSIMD_Generic::MixSoundTwoSpeakerStereo
2952 void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
2953 float sL = lastV[0];
2954 float sR = lastV[1];
2955 float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2956 float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2958 assert( numSamples == MIXBUFFER_SAMPLES );
2960 for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
2961 mixBuffer[j*2+0] += samples[j*2+0] * sL;
2962 mixBuffer[j*2+1] += samples[j*2+1] * sR;
2970 idSIMD_Generic::MixSoundSixSpeakerMono
2973 void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
2974 float sL0 = lastV[0];
2975 float sL1 = lastV[1];
2976 float sL2 = lastV[2];
2977 float sL3 = lastV[3];
2978 float sL4 = lastV[4];
2979 float sL5 = lastV[5];
2981 float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
2982 float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
2983 float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
2984 float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
2985 float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
2986 float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
2988 assert( numSamples == MIXBUFFER_SAMPLES );
2990 for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
2991 mixBuffer[i*6+0] += samples[i] * sL0;
2992 mixBuffer[i*6+1] += samples[i] * sL1;
2993 mixBuffer[i*6+2] += samples[i] * sL2;
2994 mixBuffer[i*6+3] += samples[i] * sL3;
2995 mixBuffer[i*6+4] += samples[i] * sL4;
2996 mixBuffer[i*6+5] += samples[i] * sL5;
3008 idSIMD_Generic::MixSoundSixSpeakerStereo
3011 void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
3012 float sL0 = lastV[0];
3013 float sL1 = lastV[1];
3014 float sL2 = lastV[2];
3015 float sL3 = lastV[3];
3016 float sL4 = lastV[4];
3017 float sL5 = lastV[5];
3019 float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
3020 float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
3021 float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
3022 float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
3023 float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
3024 float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
3026 assert( numSamples == MIXBUFFER_SAMPLES );
3028 for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
3029 mixBuffer[i*6+0] += samples[i*2+0] * sL0;
3030 mixBuffer[i*6+1] += samples[i*2+1] * sL1;
3031 mixBuffer[i*6+2] += samples[i*2+0] * sL2;
3032 mixBuffer[i*6+3] += samples[i*2+0] * sL3;
3033 mixBuffer[i*6+4] += samples[i*2+0] * sL4;
3034 mixBuffer[i*6+5] += samples[i*2+1] * sL5;
3046 idSIMD_Generic::MixedSoundToSamples
3049 void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
3051 for ( int i = 0; i < numSamples; i++ ) {
3052 if ( mixBuffer[i] <= -32768.0f ) {
3053 samples[i] = -32768;
3054 } else if ( mixBuffer[i] >= 32767.0f ) {
3057 samples[i] = (short) mixBuffer[i];