neo/idlib/math/Lcp.cpp

   1 /*
   2 ===========================================================================
   3
   4 Doom 3 GPL Source Code
   5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
   6
   7 This file is part of the Doom 3 GPL Source Code (?Doom 3 Source Code?).
   8
   9 Doom 3 Source Code is free software: you can redistribute it and/or modify
  10 it under the terms of the GNU General Public License as published by
  11 the Free Software Foundation, either version 3 of the License, or
  12 (at your option) any later version.
  13
  14 Doom 3 Source Code is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
  23
  24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
  25
  26 ===========================================================================
  27 */
  28
  29 #include "../precompiled.h"
  30 #pragma hdrstop
  31
  32 static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_SYSTEM | CVAR_BOOL, "show LCP solver failures" );
  33
  34 const float LCP_BOUND_EPSILON                   = 1e-5f;
  35 const float LCP_ACCEL_EPSILON                   = 1e-5f;
  36 const float LCP_DELTA_ACCEL_EPSILON             = 1e-9f;
  37 const float LCP_DELTA_FORCE_EPSILON             = 1e-9f;
  38
  39 #define IGNORE_UNSATISFIABLE_VARIABLES
  40
  41 //===============================================================
  42 //                                                        M
  43 //  idLCP_Square                                         MrE
  44 //                                                        E
  45 //===============================================================
  46
  47 class idLCP_Square : public idLCP {
  48 public:
  49         virtual bool    Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
  50
  51 private:
  52         idMatX                  m;                                      // original matrix
  53         idVecX                  b;                                      // right hand side
  54         idVecX                  lo, hi;                         // low and high bounds
  55         idVecX                  f, a;                           // force and acceleration
  56         idVecX                  delta_f, delta_a;       // delta force and delta acceleration
  57         idMatX                  clamped;                        // LU factored sub matrix for clamped variables
  58         idVecX                  diagonal;                       // reciprocal of diagonal of U of the LU factored sub matrix for clamped variables
  59         int                             numUnbounded;           // number of unbounded variables
  60         int                             numClamped;                     // number of clamped variables
  61         float **                rowPtrs;                        // pointers to the rows of m
  62         int *                   boxIndex;                       // box index
  63         int *                   side;                           // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
  64         int *                   permuted;                       // index to keep track of the permutation
  65         bool                    padded;                         // set to true if the rows of the initial matrix are 16 byte padded
  66
  67 private:
  68         bool                    FactorClamped( void );
  69         void                    SolveClamped( idVecX &x, const float *b );
  70         void                    Swap( int i, int j );
  71         void                    AddClamped( int r );
  72         void                    RemoveClamped( int r );
  73         void                    CalcForceDelta( int d, float dir );
  74         void                    CalcAccelDelta( int d );
  75         void                    ChangeForce( int d, float step );
  76         void                    ChangeAccel( int d, float step );
  77         void                    GetMaxStep( int d, float dir, float &maxStep, int &limit, int &limitSide ) const;
  78 };
  79
  80 /*
  81 ============
  82 idLCP_Square::FactorClamped
  83 ============
  84 */
  85 bool idLCP_Square::FactorClamped( void ) {
  86         int i, j, k;
  87         float s, d;
  88
  89         for ( i = 0; i < numClamped; i++ ) {
  90                 memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
  91         }
  92
  93         for ( i = 0; i < numClamped; i++ ) {
  94
  95                 s = idMath::Fabs( clamped[i][i] );
  96
  97                 if ( s == 0.0f ) {
  98                         return false;
  99                 }
 100
 101                 diagonal[i] = d = 1.0f / clamped[i][i];
 102                 for ( j = i + 1; j < numClamped; j++ ) {
 103                         clamped[j][i] *= d;
 104                 }
 105
 106                 for ( j = i + 1; j < numClamped; j++ ) {
 107                         d = clamped[j][i];
 108                         for ( k = i + 1; k < numClamped; k++ ) {
 109                                 clamped[j][k] -= d * clamped[i][k];
 110                         }
 111                 }
 112         }
 113
 114         return true;
 115 }
 116
 117 /*
 118 ============
 119 idLCP_Square::SolveClamped
 120 ============
 121 */
 122 void idLCP_Square::SolveClamped( idVecX &x, const float *b ) {
 123         int i, j;
 124         float sum;
 125
 126         // solve L
 127         for ( i = 0; i < numClamped; i++ ) {
 128                 sum = b[i];
 129                 for ( j = 0; j < i; j++ ) {
 130                         sum -= clamped[i][j] * x[j];
 131                 }
 132                 x[i] = sum;
 133         }
 134
 135         // solve U
 136         for ( i = numClamped - 1; i >= 0; i-- ) {
 137                 sum = x[i];
 138                 for ( j = i + 1; j < numClamped; j++ ) {
 139                         sum -= clamped[i][j] * x[j];
 140                 }
 141                 x[i] = sum * diagonal[i];
 142         }
 143 }
 144
 145 /*
 146 ============
 147 idLCP_Square::Swap
 148 ============
 149 */
 150 void idLCP_Square::Swap( int i, int j ) {
 151
 152         if ( i == j ) {
 153                 return;
 154         }
 155
 156         idSwap( rowPtrs[i], rowPtrs[j] );
 157         m.SwapColumns( i, j );
 158         b.SwapElements( i, j );
 159         lo.SwapElements( i, j );
 160         hi.SwapElements( i, j );
 161         a.SwapElements( i, j );
 162         f.SwapElements( i, j );
 163         if ( boxIndex ) {
 164                 idSwap( boxIndex[i], boxIndex[j] );
 165         }
 166         idSwap( side[i], side[j] );
 167         idSwap( permuted[i], permuted[j] );
 168 }
 169
 170 /*
 171 ============
 172 idLCP_Square::AddClamped
 173 ============
 174 */
 175 void idLCP_Square::AddClamped( int r ) {
 176         int i, j;
 177         float sum;
 178
 179         assert( r >= numClamped );
 180
 181         // add a row at the bottom and a column at the right of the factored
 182         // matrix for the clamped variables
 183
 184         Swap( numClamped, r );
 185
 186         // add row to L
 187         for ( i = 0; i < numClamped; i++ ) {
 188                 sum = rowPtrs[numClamped][i];
 189                 for ( j = 0; j < i; j++ ) {
 190                         sum -= clamped[numClamped][j] * clamped[j][i];
 191                 }
 192                 clamped[numClamped][i] = sum * diagonal[i];
 193         }
 194
 195         // add column to U
 196         for ( i = 0; i <= numClamped; i++ ) {
 197                 sum = rowPtrs[i][numClamped];
 198                 for ( j = 0; j < i; j++ ) {
 199                         sum -= clamped[i][j] * clamped[j][numClamped];
 200                 }
 201                 clamped[i][numClamped] = sum;
 202         }
 203
 204         diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped];
 205
 206         numClamped++;
 207 }
 208
 209 /*
 210 ============
 211 idLCP_Square::RemoveClamped
 212 ============
 213 */
 214 void idLCP_Square::RemoveClamped( int r ) {
 215         int i, j;
 216         float *y0, *y1, *z0, *z1;
 217         double diag, beta0, beta1, p0, p1, q0, q1, d;
 218
 219         assert( r < numClamped );
 220
 221         numClamped--;
 222
 223         // no need to swap and update the factored matrix when the last row and column are removed
 224         if ( r == numClamped ) {
 225                 return;
 226         }
 227
 228         y0 = (float *) _alloca16( numClamped * sizeof( float ) );
 229         z0 = (float *) _alloca16( numClamped * sizeof( float ) );
 230         y1 = (float *) _alloca16( numClamped * sizeof( float ) );
 231         z1 = (float *) _alloca16( numClamped * sizeof( float ) );
 232
 233         // the row/column need to be subtracted from the factorization
 234         for ( i = 0; i < numClamped; i++ ) {
 235                 y0[i] = -rowPtrs[i][r];
 236         }
 237
 238         memset( y1, 0, numClamped * sizeof( float ) );
 239         y1[r] = 1.0f;
 240
 241         memset( z0, 0, numClamped * sizeof( float ) );
 242         z0[r] = 1.0f;
 243
 244         for ( i = 0; i < numClamped; i++ ) {
 245                 z1[i] = -rowPtrs[r][i];
 246         }
 247
 248         // swap the to be removed row/column with the last row/column
 249         Swap( r, numClamped );
 250
 251         // the swapped last row/column need to be added to the factorization
 252         for ( i = 0; i < numClamped; i++ ) {
 253                 y0[i] += rowPtrs[i][r];
 254         }
 255
 256         for ( i = 0; i < numClamped; i++ ) {
 257                 z1[i] += rowPtrs[r][i];
 258         }
 259         z1[r] = 0.0f;
 260
 261         // update the beginning of the to be updated row and column
 262         for ( i = 0; i < r; i++ ) {
 263                 p0 = y0[i];
 264                 beta1 = z1[i] * diagonal[i];
 265
 266                 clamped[i][r] += p0;
 267                 for ( j = i+1; j < numClamped; j++ ) {
 268                         z1[j] -= beta1 * clamped[i][j];
 269                 }
 270                 for ( j = i+1; j < numClamped; j++ ) {
 271                         y0[j] -= p0 * clamped[j][i];
 272                 }
 273                 clamped[r][i] += beta1;
 274         }
 275
 276         // update the lower right corner starting at r,r
 277         for ( i = r; i < numClamped; i++ ) {
 278                 diag = clamped[i][i];
 279
 280                 p0 = y0[i];
 281                 p1 = z0[i];
 282                 diag += p0 * p1;
 283
 284                 if ( diag == 0.0f ) {
 285                         idLib::common->Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
 286                         return;
 287                 }
 288
 289                 beta0 = p1 / diag;
 290
 291                 q0 = y1[i];
 292                 q1 = z1[i];
 293                 diag += q0 * q1;
 294
 295                 if ( diag == 0.0f ) {
 296                         idLib::common->Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
 297                         return;
 298                 }
 299
 300                 d = 1.0f / diag;
 301                 beta1 = q1 * d;
 302
 303                 clamped[i][i] = diag;
 304                 diagonal[i] = d;
 305
 306                 for ( j = i+1; j < numClamped; j++ ) {
 307
 308                         d = clamped[i][j];
 309
 310                         d += p0 * z0[j];
 311                         z0[j] -= beta0 * d;
 312
 313                         d += q0 * z1[j];
 314                         z1[j] -= beta1 * d;
 315
 316                         clamped[i][j] = d;
 317                 }
 318
 319                 for ( j = i+1; j < numClamped; j++ ) {
 320
 321                         d = clamped[j][i];
 322
 323                         y0[j] -= p0 * d;
 324                         d += beta0 * y0[j];
 325
 326                         y1[j] -= q0 * d;
 327                         d += beta1 * y1[j];
 328
 329                         clamped[j][i] = d;
 330                 }
 331         }
 332         return;
 333 }
 334
 335 /*
 336 ============
 337 idLCP_Square::CalcForceDelta
 338
 339   modifies this->delta_f
 340 ============
 341 */
 342 ID_INLINE void idLCP_Square::CalcForceDelta( int d, float dir ) {
 343         int i;
 344         float *ptr;
 345
 346         delta_f[d] = dir;
 347
 348         if ( numClamped == 0 ) {
 349                 return;
 350         }
 351
 352         // get column d of matrix
 353         ptr = (float *) _alloca16( numClamped * sizeof( float ) );
 354         for ( i = 0; i < numClamped; i++ ) {
 355                 ptr[i] = rowPtrs[i][d];
 356         }
 357
 358         // solve force delta
 359         SolveClamped( delta_f, ptr );
 360
 361         // flip force delta based on direction
 362         if ( dir > 0.0f ) {
 363                 ptr = delta_f.ToFloatPtr();
 364                 for ( i = 0; i < numClamped; i++ ) {
 365                         ptr[i] = - ptr[i];
 366                 }
 367         }
 368 }
 369
 370 /*
 371 ============
 372 idLCP_Square::CalcAccelDelta
 373
 374   modifies this->delta_a and uses this->delta_f
 375 ============
 376 */
 377 ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) {
 378         int j;
 379         float dot;
 380
 381         // only the not clamped variables, including the current variable, can have a change in acceleration
 382         for ( j = numClamped; j <= d; j++ ) {
 383                 // only the clamped variables and the current variable have a force delta unequal zero
 384                 SIMDProcessor->Dot( dot, rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
 385                 delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
 386         }
 387 }
 388
 389 /*
 390 ============
 391 idLCP_Square::ChangeForce
 392
 393   modifies this->f and uses this->delta_f
 394 ============
 395 */
 396 ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) {
 397         // only the clamped variables and current variable have a force delta unequal zero
 398         SIMDProcessor->MulAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
 399         f[d] += step * delta_f[d];
 400 }
 401
 402 /*
 403 ============
 404 idLCP_Square::ChangeAccel
 405
 406   modifies this->a and uses this->delta_a
 407 ============
 408 */
 409 ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) {
 410         // only the not clamped variables, including the current variable, can have an acceleration unequal zero
 411         SIMDProcessor->MulAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
 412 }
 413
 414 /*
 415 ============
 416 idLCP_Square::GetMaxStep
 417 ============
 418 */
 419 void idLCP_Square::GetMaxStep( int d, float dir, float &maxStep, int &limit, int &limitSide ) const {
 420         int i;
 421         float s;
 422
 423         // default to a full step for the current variable
 424         if ( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON ) {
 425                 maxStep = -a[d] / delta_a[d];
 426         } else {
 427                 maxStep = 0.0f;
 428         }
 429         limit = d;
 430         limitSide = 0;
 431
 432         // test the current variable
 433         if ( dir < 0.0f ) {
 434                 if ( lo[d] != -idMath::INFINITY ) {
 435                         s = ( lo[d] - f[d] ) / dir;
 436                         if ( s < maxStep ) {
 437                                 maxStep = s;
 438                                 limitSide = -1;
 439                         }
 440                 }
 441         } else {
 442                 if ( hi[d] != idMath::INFINITY ) {
 443                         s = ( hi[d] - f[d] ) / dir;
 444                         if ( s < maxStep ) {
 445                                 maxStep = s;
 446                                 limitSide = 1;
 447                         }
 448                 }
 449         }
 450
 451         // test the clamped bounded variables
 452         for ( i = numUnbounded; i < numClamped; i++ ) {
 453                 if ( delta_f[i] < -LCP_DELTA_FORCE_EPSILON ) {
 454                         // if there is a low boundary
 455                         if ( lo[i] != -idMath::INFINITY ) {
 456                                 s = ( lo[i] - f[i] ) / delta_f[i];
 457                                 if ( s < maxStep ) {
 458                                         maxStep = s;
 459                                         limit = i;
 460                                         limitSide = -1;
 461                                 }
 462                         }
 463                 } else if ( delta_f[i] > LCP_DELTA_FORCE_EPSILON ) {
 464                         // if there is a high boundary
 465                         if ( hi[i] != idMath::INFINITY ) {
 466                                 s = ( hi[i] - f[i] ) / delta_f[i];
 467                                 if ( s < maxStep ) {
 468                                         maxStep = s;
 469                                         limit = i;
 470                                         limitSide = 1;
 471                                 }
 472                         }
 473                 }
 474         }
 475
 476         // test the not clamped bounded variables
 477         for ( i = numClamped; i < d; i++ ) {
 478                 if ( side[i] == -1 ) {
 479                         if ( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON ) {
 480                                 continue;
 481                         }
 482                 } else if ( side[i] == 1 ) {
 483                         if ( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON ) {
 484                                 continue;
 485                         }
 486                 } else {
 487                         continue;
 488                 }
 489                 // ignore variables for which the force is not allowed to take any substantial value
 490                 if ( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON ) {
 491                         continue;
 492                 }
 493                 s = -a[i] / delta_a[i];
 494                 if ( s < maxStep ) {
 495                         maxStep = s;
 496                         limit = i;
 497                         limitSide = 0;
 498                 }
 499         }
 500 }
 501
 502 /*
 503 ============
 504 idLCP_Square::Solve
 505 ============
 506 */
 507 bool idLCP_Square::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
 508         int i, j, n, limit, limitSide, boxStartIndex;
 509         float dir, maxStep, dot, s;
 510         char *failed;
 511
 512         // true when the matrix rows are 16 byte padded
 513         padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
 514
 515         assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
 516         assert( o_x.GetSize() == o_m.GetNumRows() );
 517         assert( o_b.GetSize() == o_m.GetNumRows() );
 518         assert( o_lo.GetSize() == o_m.GetNumRows() );
 519         assert( o_hi.GetSize() == o_m.GetNumRows() );
 520
 521         // allocate memory for permuted input
 522         f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
 523         a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
 524         b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
 525         lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
 526         hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
 527         if ( o_boxIndex ) {
 528                 boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
 529                 memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
 530         } else {
 531                 boxIndex = NULL;
 532         }
 533
 534         // we override the const on o_m here but on exit the matrix is unchanged
 535         m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) );
 536         f.Zero();
 537         a.Zero();
 538         b = o_b;
 539         lo = o_lo;
 540         hi = o_hi;
 541
 542         // pointers to the rows of m
 543         rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
 544         for ( i = 0; i < m.GetNumRows(); i++ ) {
 545                 rowPtrs[i] = m[i];
 546         }
 547
 548         // tells if a variable is at the low boundary, high boundary or inbetween
 549         side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
 550
 551         // index to keep track of the permutation
 552         permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
 553         for ( i = 0; i < m.GetNumRows(); i++ ) {
 554                 permuted[i] = i;
 555         }
 556
 557         // permute input so all unbounded variables come first
 558         numUnbounded = 0;
 559         for ( i = 0; i < m.GetNumRows(); i++ ) {
 560                 if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
 561                         if ( numUnbounded != i ) {
 562                                 Swap( numUnbounded, i );
 563                         }
 564                         numUnbounded++;
 565                 }
 566         }
 567
 568         // permute input so all variables using the boxIndex come last
 569         boxStartIndex = m.GetNumRows();
 570         if ( boxIndex ) {
 571                 for ( i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
 572                         if ( boxIndex[i] >= 0 && ( lo[i] != -idMath::INFINITY || hi[i] != idMath::INFINITY ) ) {
 573                                 boxStartIndex--;
 574                                 if ( boxStartIndex != i ) {
 575                                         Swap( boxStartIndex, i );
 576                                 }
 577                         }
 578                 }
 579         }
 580
 581         // sub matrix for factorization
 582         clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) );
 583         diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
 584
 585         // all unbounded variables are clamped
 586         numClamped = numUnbounded;
 587
 588         // if there are unbounded variables
 589         if ( numUnbounded ) {
 590
 591                 // factor and solve for unbounded variables
 592                 if ( !FactorClamped() ) {
 593                         idLib::common->Printf( "idLCP_Square::Solve: unbounded factorization failed\n" );
 594                         return false;
 595                 }
 596                 SolveClamped( f, b.ToFloatPtr() );
 597
 598                 // if there are no bounded variables we are done
 599                 if ( numUnbounded == m.GetNumRows() ) {
 600                         o_x = f;        // the vector is not permuted
 601                         return true;
 602                 }
 603         }
 604
 605 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
 606         int numIgnored = 0;
 607 #endif
 608
 609         // allocate for delta force and delta acceleration
 610         delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
 611         delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
 612
 613         // solve for bounded variables
 614         failed = NULL;
 615         for ( i = numUnbounded; i < m.GetNumRows(); i++ ) {
 616
 617                 // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
 618                 if ( i == boxStartIndex ) {
 619                         for ( j = 0; j < boxStartIndex; j++ ) {
 620                                 o_x[permuted[j]] = f[j];
 621                         }
 622                         for ( j = boxStartIndex; j < m.GetNumRows(); j++ ) {
 623                                 s = o_x[boxIndex[j]];
 624                                 if ( lo[j] != -idMath::INFINITY ) {
 625                                         lo[j] = - idMath::Fabs( lo[j] * s );
 626                                 }
 627                                 if ( hi[j] != idMath::INFINITY ) {
 628                                         hi[j] = idMath::Fabs( hi[j] * s );
 629                                 }
 630                         }
 631                 }
 632
 633                 // calculate acceleration for current variable
 634                 SIMDProcessor->Dot( dot, rowPtrs[i], f.ToFloatPtr(), i );
 635                 a[i] = dot - b[i];
 636
 637                 // if already at the low boundary
 638                 if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
 639                         side[i] = -1;
 640                         continue;
 641                 }
 642
 643                 // if already at the high boundary
 644                 if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
 645                         side[i] = 1;
 646                         continue;
 647                 }
 648
 649                 // if inside the clamped region
 650                 if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
 651                         side[i] = 0;
 652                         AddClamped( i );
 653                         continue;
 654                 }
 655
 656                 // drive the current variable into a valid region
 657                 for ( n = 0; n < maxIterations; n++ ) {
 658
 659                         // direction to move
 660                         if ( a[i] <= 0.0f ) {
 661                                 dir = 1.0f;
 662                         } else {
 663                                 dir = -1.0f;
 664                         }
 665
 666                         // calculate force delta
 667                         CalcForceDelta( i, dir );
 668
 669                         // calculate acceleration delta: delta_a = m * delta_f;
 670                         CalcAccelDelta( i );
 671
 672                         // maximum step we can take
 673                         GetMaxStep( i, dir, maxStep, limit, limitSide );
 674
 675                         if ( maxStep <= 0.0f ) {
 676 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
 677                                 // ignore the current variable completely
 678                                 lo[i] = hi[i] = 0.0f;
 679                                 f[i] = 0.0f;
 680                                 side[i] = -1;
 681                                 numIgnored++;
 682 #else
 683                                 failed = va( "invalid step size %.4f", maxStep );
 684 #endif
 685                                 break;
 686                         }
 687
 688                         // change force
 689                         ChangeForce( i, maxStep );
 690
 691                         // change acceleration
 692                         ChangeAccel( i, maxStep );
 693
 694                         // clamp/unclamp the variable that limited this step
 695                         side[limit] = limitSide;
 696                         switch( limitSide ) {
 697                                 case 0: {
 698                                         a[limit] = 0.0f;
 699                                         AddClamped( limit );
 700                                         break;
 701                                 }
 702                                 case -1: {
 703                                         f[limit] = lo[limit];
 704                                         if ( limit != i ) {
 705                                                 RemoveClamped( limit );
 706                                         }
 707                                         break;
 708                                 }
 709                                 case 1: {
 710                                         f[limit] = hi[limit];
 711                                         if ( limit != i ) {
 712                                                 RemoveClamped( limit );
 713                                         }
 714                                         break;
 715                                 }
 716                         }
 717
 718                         // if the current variable limited the step we can continue with the next variable
 719                         if ( limit == i ) {
 720                                 break;
 721                         }
 722                 }
 723
 724                 if ( n >= maxIterations ) {
 725                         failed = va( "max iterations %d", maxIterations );
 726                         break;
 727                 }
 728
 729                 if ( failed ) {
 730                         break;
 731                 }
 732         }
 733
 734 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
 735         if ( numIgnored ) {
 736                 if ( lcp_showFailures.GetBool() ) {
 737                         idLib::common->Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
 738                 }
 739         }
 740 #endif
 741
 742         // if failed clear remaining forces
 743         if ( failed ) {
 744                 if ( lcp_showFailures.GetBool() ) {
 745                         idLib::common->Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed, m.GetNumRows() - i, m.GetNumRows() - numUnbounded );
 746                 }
 747                 for ( j = i; j < m.GetNumRows(); j++ ) {
 748                         f[j] = 0.0f;
 749                 }
 750         }
 751
 752 #if defined(_DEBUG) && 0
 753         if ( !failed ) {
 754                 // test whether or not the solution satisfies the complementarity conditions
 755                 for ( i = 0; i < m.GetNumRows(); i++ ) {
 756                         a[i] = -b[i];
 757                         for ( j = 0; j < m.GetNumRows(); j++ ) {
 758                                 a[i] += rowPtrs[i][j] * f[j];
 759                         }
 760
 761                         if ( f[i] == lo[i] ) {
 762                                 if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
 763                                         int bah1 = 1;
 764                                 }
 765                         } else if ( f[i] == hi[i] ) {
 766                                 if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
 767                                         int bah2 = 1;
 768                                 }
 769                         } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
 770                                 int bah3 = 1;
 771                         }
 772                 }
 773         }
 774 #endif
 775
 776         // unpermute result
 777         for ( i = 0; i < f.GetSize(); i++ ) {
 778                 o_x[permuted[i]] = f[i];
 779         }
 780
 781         // unpermute original matrix
 782         for ( i = 0; i < m.GetNumRows(); i++ ) {
 783                 for ( j = 0; j < m.GetNumRows(); j++ ) {
 784                         if ( permuted[j] == i ) {
 785                                 break;
 786                         }
 787                 }
 788                 if ( i != j ) {
 789                         m.SwapColumns( i, j );
 790                         idSwap( permuted[i], permuted[j] );
 791                 }
 792         }
 793
 794         return true;
 795 }
 796
 797
 798 //===============================================================
 799 //                                                        M
 800 //  idLCP_Symmetric                                      MrE
 801 //                                                        E
 802 //===============================================================
 803
 804 class idLCP_Symmetric : public idLCP {
 805 public:
 806         virtual bool    Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
 807
 808 private:
 809         idMatX                  m;                                      // original matrix
 810         idVecX                  b;                                      // right hand side
 811         idVecX                  lo, hi;                         // low and high bounds
 812         idVecX                  f, a;                           // force and acceleration
 813         idVecX                  delta_f, delta_a;       // delta force and delta acceleration
 814         idMatX                  clamped;                        // LDLt factored sub matrix for clamped variables
 815         idVecX                  diagonal;                       // reciprocal of diagonal of LDLt factored sub matrix for clamped variables
 816         idVecX                  solveCache1;            // intermediate result cached in SolveClamped
 817         idVecX                  solveCache2;            // "
 818         int                             numUnbounded;           // number of unbounded variables
 819         int                             numClamped;                     // number of clamped variables
 820         int                             clampedChangeStart;     // lowest row/column changed in the clamped matrix during an iteration
 821         float **                rowPtrs;                        // pointers to the rows of m
 822         int *                   boxIndex;                       // box index
 823         int *                   side;                           // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
 824         int *                   permuted;                       // index to keep track of the permutation
 825         bool                    padded;                         // set to true if the rows of the initial matrix are 16 byte padded
 826
 827 private:
 828         bool                    FactorClamped( void );
 829         void                    SolveClamped( idVecX &x, const float *b );
 830         void                    Swap( int i, int j );
 831         void                    AddClamped( int r, bool useSolveCache );
 832         void                    RemoveClamped( int r );
 833         void                    CalcForceDelta( int d, float dir );
 834         void                    CalcAccelDelta( int d );
 835         void                    ChangeForce( int d, float step );
 836         void                    ChangeAccel( int d, float step );
 837         void                    GetMaxStep( int d, float dir, float &maxStep, int &limit, int &limitSide ) const;
 838 };
 839
 840 /*
 841 ============
 842 idLCP_Symmetric::FactorClamped
 843 ============
 844 */
 845 bool idLCP_Symmetric::FactorClamped( void ) {
 846
 847         clampedChangeStart = 0;
 848
 849         for ( int i = 0; i < numClamped; i++ ) {
 850                 memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
 851         }
 852         return SIMDProcessor->MatX_LDLTFactor( clamped, diagonal, numClamped );
 853 }
 854
 855 /*
 856 ============
 857 idLCP_Symmetric::SolveClamped
 858 ============
 859 */
 860 void idLCP_Symmetric::SolveClamped( idVecX &x, const float *b ) {
 861
 862         // solve L
 863         SIMDProcessor->MatX_LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart );
 864
 865         // solve D
 866         SIMDProcessor->Mul( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped );
 867
 868         // solve Lt
 869         SIMDProcessor->MatX_LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped );
 870
 871         clampedChangeStart = numClamped;
 872 }
 873
 874 /*
 875 ============
 876 idLCP_Symmetric::Swap
 877 ============
 878 */
 879 void idLCP_Symmetric::Swap( int i, int j ) {
 880
 881         if ( i == j ) {
 882                 return;
 883         }
 884
 885         idSwap( rowPtrs[i], rowPtrs[j] );
 886         m.SwapColumns( i, j );
 887         b.SwapElements( i, j );
 888         lo.SwapElements( i, j );
 889         hi.SwapElements( i, j );
 890         a.SwapElements( i, j );
 891         f.SwapElements( i, j );
 892         if ( boxIndex ) {
 893                 idSwap( boxIndex[i], boxIndex[j] );
 894         }
 895         idSwap( side[i], side[j] );
 896         idSwap( permuted[i], permuted[j] );
 897 }
 898
 899 /*
 900 ============
 901 idLCP_Symmetric::AddClamped
 902 ============
 903 */
 904 void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) {
 905         float d, dot;
 906
 907         assert( r >= numClamped );
 908
 909         if ( numClamped < clampedChangeStart ) {
 910                 clampedChangeStart = numClamped;
 911         }
 912
 913         // add a row at the bottom and a column at the right of the factored
 914         // matrix for the clamped variables
 915
 916         Swap( numClamped, r );
 917
 918         // solve for v in L * v = rowPtr[numClamped]
 919         if ( useSolveCache ) {
 920
 921                 // the lower triangular solve was cached in SolveClamped called by CalcForceDelta
 922                 memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) );
 923                 // calculate row dot product
 924                 SIMDProcessor->Dot( dot, solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped );
 925
 926         } else {
 927
 928                 float *v = (float *) _alloca16( numClamped * sizeof( float ) );
 929
 930                 SIMDProcessor->MatX_LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped );
 931                 // add bottom row to L
 932                 SIMDProcessor->Mul( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped );
 933                 // calculate row dot product
 934                 SIMDProcessor->Dot( dot, clamped[numClamped], v, numClamped );
 935         }
 936
 937         // update diagonal[numClamped]
 938         d = rowPtrs[numClamped][numClamped] - dot;
 939
 940         if ( d == 0.0f ) {
 941                 idLib::common->Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" );
 942                 numClamped++;
 943                 return;
 944         }
 945
 946         clamped[numClamped][numClamped] = d;
 947         diagonal[numClamped] = 1.0f / d;
 948
 949         numClamped++;
 950 }
 951
 952 /*
 953 ============
 954 idLCP_Symmetric::RemoveClamped
 955 ============
 956 */
 957 void idLCP_Symmetric::RemoveClamped( int r ) {
 958         int i, j, n;
 959         float *addSub, *original, *v, *ptr, *v1, *v2, dot;
 960         double sum, diag, newDiag, invNewDiag, p1, p2, alpha1, alpha2, beta1, beta2;
 961
 962         assert( r < numClamped );
 963
 964         if ( r < clampedChangeStart ) {
 965                 clampedChangeStart = r;
 966         }
 967
 968         numClamped--;
 969
 970         // no need to swap and update the factored matrix when the last row and column are removed
 971         if ( r == numClamped ) {
 972                 return;
 973         }
 974
 975         // swap the to be removed row/column with the last row/column
 976         Swap( r, numClamped );
 977
 978         // update the factored matrix
 979         addSub = (float *) _alloca16( numClamped * sizeof( float ) );
 980
 981         if ( r == 0 ) {
 982
 983                 if ( numClamped == 1 ) {
 984                         diag = rowPtrs[0][0];
 985                         if ( diag == 0.0f ) {
 986                                 idLib::common->Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
 987                                 return;
 988                         }
 989                         clamped[0][0] = diag;
 990                         diagonal[0] = 1.0f / diag;
 991                         return;
 992                 }
 993
 994                 // calculate the row/column to be added to the lower right sub matrix starting at (r, r)
 995                 original = rowPtrs[numClamped];
 996                 ptr = rowPtrs[r];
 997                 addSub[0] = ptr[0] - original[numClamped];
 998                 for ( i = 1; i < numClamped; i++ ) {
 999                         addSub[i] = ptr[i] - original[i];
1000                 }
1001
1002         } else {
1003
1004                 v = (float *) _alloca16( numClamped * sizeof( float ) );
1005
1006                 // solve for v in L * v = rowPtr[r]
1007                 SIMDProcessor->MatX_LowerTriangularSolve( clamped, v, rowPtrs[r], r );
1008
1009                 // update removed row
1010                 SIMDProcessor->Mul( clamped[r], v, diagonal.ToFloatPtr(), r );
1011
1012                 // if the last row/column of the matrix is updated
1013                 if ( r == numClamped - 1 ) {
1014                         // only calculate new diagonal
1015                         SIMDProcessor->Dot( dot, clamped[r], v, r );
1016                         diag = rowPtrs[r][r] - dot;
1017                         if ( diag == 0.0f ) {
1018                                 idLib::common->Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
1019                                 return;
1020                         }
1021                         clamped[r][r] = diag;
1022                         diagonal[r] = 1.0f / diag;
1023                         return;
1024                 }
1025
1026                 // calculate the row/column to be added to the lower right sub matrix starting at (r, r)
1027                 for ( i = 0; i < r; i++ ) {
1028                         v[i] = clamped[r][i] * clamped[i][i];
1029                 }
1030                 for ( i = r; i < numClamped; i++ ) {
1031                         if ( i == r ) {
1032                                 sum = clamped[r][r];
1033                         } else {
1034                                 sum = clamped[r][r] * clamped[i][r];
1035                         }
1036                         ptr = clamped[i];
1037                         for ( j = 0; j < r; j++ ) {
1038                                 sum += ptr[j] * v[j];
1039                         }
1040                         addSub[i] = rowPtrs[r][i] - sum;
1041                 }
1042         }
1043
1044         // add row/column to the lower right sub matrix starting at (r, r)
1045
1046         v1 = (float *) _alloca16( numClamped * sizeof( float ) );
1047         v2 = (float *) _alloca16( numClamped * sizeof( float ) );
1048
1049         diag = idMath::SQRT_1OVER2;
1050         v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag;
1051         v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag;
1052         for ( i = r+1; i < numClamped; i++ ) {
1053                 v1[i] = v2[i] = addSub[i] * diag;
1054         }
1055
1056         alpha1 = 1.0f;
1057         alpha2 = -1.0f;
1058
1059         // simultaneous update/downdate of the sub matrix starting at (r, r)
1060         n = clamped.GetNumColumns();
1061         for ( i = r; i < numClamped; i++ ) {
1062
1063                 diag = clamped[i][i];
1064                 p1 = v1[i];
1065                 newDiag = diag + alpha1 * p1 * p1;
1066
1067                 if ( newDiag == 0.0f ) {
1068                         idLib::common->Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
1069                         return;
1070                 }
1071
1072                 alpha1 /= newDiag;
1073                 beta1 = p1 * alpha1;
1074                 alpha1 *= diag;
1075
1076                 diag = newDiag;
1077                 p2 = v2[i];
1078                 newDiag = diag + alpha2 * p2 * p2;
1079
1080                 if ( newDiag == 0.0f ) {
1081                         idLib::common->Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
1082                         return;
1083                 }
1084
1085                 clamped[i][i] = newDiag;
1086                 diagonal[i] = invNewDiag = 1.0f / newDiag;
1087
1088                 alpha2 *= invNewDiag;
1089                 beta2 = p2 * alpha2;
1090                 alpha2 *= diag;
1091
1092                 // update column below diagonal (i,i)
1093                 ptr = clamped.ToFloatPtr() + i;
1094
1095                 for ( j = i+1; j < numClamped - 1; j += 2 ) {
1096
1097                         float sum0 = ptr[(j+0)*n];
1098                         float sum1 = ptr[(j+1)*n];
1099
1100                         v1[j+0] -= p1 * sum0;
1101                         v1[j+1] -= p1 * sum1;
1102
1103                         sum0 += beta1 * v1[j+0];
1104                         sum1 += beta1 * v1[j+1];
1105
1106                         v2[j+0] -= p2 * sum0;
1107                         v2[j+1] -= p2 * sum1;
1108
1109                         sum0 += beta2 * v2[j+0];
1110                         sum1 += beta2 * v2[j+1];
1111
1112                         ptr[(j+0)*n] = sum0;
1113                         ptr[(j+1)*n] = sum1;
1114                 }
1115
1116                 for ( ; j < numClamped; j++ ) {
1117
1118                         sum = ptr[j*n];
1119
1120                         v1[j] -= p1 * sum;
1121                         sum += beta1 * v1[j];
1122
1123                         v2[j] -= p2 * sum;
1124                         sum += beta2 * v2[j];
1125
1126                         ptr[j*n] = sum;
1127                 }
1128         }
1129 }
1130
1131 /*
1132 ============
1133 idLCP_Symmetric::CalcForceDelta
1134
1135   modifies this->delta_f
1136 ============
1137 */
1138 ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) {
1139         int i;
1140         float *ptr;
1141
1142         delta_f[d] = dir;
1143
1144         if ( numClamped == 0 ) {
1145                 return;
1146         }
1147
1148         // solve force delta
1149         SolveClamped( delta_f, rowPtrs[d] );
1150
1151         // flip force delta based on direction
1152         if ( dir > 0.0f ) {
1153                 ptr = delta_f.ToFloatPtr();
1154                 for ( i = 0; i < numClamped; i++ ) {
1155                         ptr[i] = - ptr[i];
1156                 }
1157         }
1158 }
1159
1160 /*
1161 ============
1162 idLCP_Symmetric::CalcAccelDelta
1163
1164   modifies this->delta_a and uses this->delta_f
1165 ============
1166 */
1167 ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) {
1168         int j;
1169         float dot;
1170
1171         // only the not clamped variables, including the current variable, can have a change in acceleration
1172         for ( j = numClamped; j <= d; j++ ) {
1173                 // only the clamped variables and the current variable have a force delta unequal zero
1174                 SIMDProcessor->Dot( dot, rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
1175                 delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
1176         }
1177 }
1178
1179 /*
1180 ============
1181 idLCP_Symmetric::ChangeForce
1182
1183   modifies this->f and uses this->delta_f
1184 ============
1185 */
1186 ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) {
1187         // only the clamped variables and current variable have a force delta unequal zero
1188         SIMDProcessor->MulAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
1189         f[d] += step * delta_f[d];
1190 }
1191
1192 /*
1193 ============
1194 idLCP_Symmetric::ChangeAccel
1195
1196   modifies this->a and uses this->delta_a
1197 ============
1198 */
1199 ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) {
1200         // only the not clamped variables, including the current variable, can have an acceleration unequal zero
1201         SIMDProcessor->MulAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
1202 }
1203
1204 /*
1205 ============
1206 idLCP_Symmetric::GetMaxStep
1207 ============
1208 */
1209 void idLCP_Symmetric::GetMaxStep( int d, float dir, float &maxStep, int &limit, int &limitSide ) const {
1210         int i;
1211         float s;
1212
1213         // default to a full step for the current variable
1214         if ( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON ) {
1215                 maxStep = -a[d] / delta_a[d];
1216         } else {
1217                 maxStep = 0.0f;
1218         }
1219         limit = d;
1220         limitSide = 0;
1221
1222         // test the current variable
1223         if ( dir < 0.0f ) {
1224                 if ( lo[d] != -idMath::INFINITY ) {
1225                         s = ( lo[d] - f[d] ) / dir;
1226                         if ( s < maxStep ) {
1227                                 maxStep = s;
1228                                 limitSide = -1;
1229                         }
1230                 }
1231         } else {
1232                 if ( hi[d] != idMath::INFINITY ) {
1233                         s = ( hi[d] - f[d] ) / dir;
1234                         if ( s < maxStep ) {
1235                                 maxStep = s;
1236                                 limitSide = 1;
1237                         }
1238                 }
1239         }
1240
1241         // test the clamped bounded variables
1242         for ( i = numUnbounded; i < numClamped; i++ ) {
1243                 if ( delta_f[i] < -LCP_DELTA_FORCE_EPSILON ) {
1244                         // if there is a low boundary
1245                         if ( lo[i] != -idMath::INFINITY ) {
1246                                 s = ( lo[i] - f[i] ) / delta_f[i];
1247                                 if ( s < maxStep ) {
1248                                         maxStep = s;
1249                                         limit = i;
1250                                         limitSide = -1;
1251                                 }
1252                         }
1253                 } else if ( delta_f[i] > LCP_DELTA_FORCE_EPSILON ) {
1254                         // if there is a high boundary
1255                         if ( hi[i] != idMath::INFINITY ) {
1256                                 s = ( hi[i] - f[i] ) / delta_f[i];
1257                                 if ( s < maxStep ) {
1258                                         maxStep = s;
1259                                         limit = i;
1260                                         limitSide = 1;
1261                                 }
1262                         }
1263                 }
1264         }
1265
1266         // test the not clamped bounded variables
1267         for ( i = numClamped; i < d; i++ ) {
1268                 if ( side[i] == -1 ) {
1269                         if ( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON ) {
1270                                 continue;
1271                         }
1272                 } else if ( side[i] == 1 ) {
1273                         if ( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON ) {
1274                                 continue;
1275                         }
1276                 } else {
1277                         continue;
1278                 }
1279                 // ignore variables for which the force is not allowed to take any substantial value
1280                 if ( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON ) {
1281                         continue;
1282                 }
1283                 s = -a[i] / delta_a[i];
1284                 if ( s < maxStep ) {
1285                         maxStep = s;
1286                         limit = i;
1287                         limitSide = 0;
1288                 }
1289         }
1290 }
1291
1292 /*
1293 ============
1294 idLCP_Symmetric::Solve
1295 ============
1296 */
1297 bool idLCP_Symmetric::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
1298         int i, j, n, limit, limitSide, boxStartIndex;
1299         float dir, maxStep, dot, s;
1300         char *failed;
1301
1302         // true when the matrix rows are 16 byte padded
1303         padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
1304
1305         assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
1306         assert( o_x.GetSize() == o_m.GetNumRows() );
1307         assert( o_b.GetSize() == o_m.GetNumRows() );
1308         assert( o_lo.GetSize() == o_m.GetNumRows() );
1309         assert( o_hi.GetSize() == o_m.GetNumRows() );
1310
1311         // allocate memory for permuted input
1312         f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
1313         a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
1314         b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
1315         lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
1316         hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
1317         if ( o_boxIndex ) {
1318                 boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
1319                 memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
1320         } else {
1321                 boxIndex = NULL;
1322         }
1323
1324         // we override the const on o_m here but on exit the matrix is unchanged
1325         m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) );
1326         f.Zero();
1327         a.Zero();
1328         b = o_b;
1329         lo = o_lo;
1330         hi = o_hi;
1331
1332         // pointers to the rows of m
1333         rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
1334         for ( i = 0; i < m.GetNumRows(); i++ ) {
1335                 rowPtrs[i] = m[i];
1336         }
1337
1338         // tells if a variable is at the low boundary, high boundary or inbetween
1339         side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
1340
1341         // index to keep track of the permutation
1342         permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
1343         for ( i = 0; i < m.GetNumRows(); i++ ) {
1344                 permuted[i] = i;
1345         }
1346
1347         // permute input so all unbounded variables come first
1348         numUnbounded = 0;
1349         for ( i = 0; i < m.GetNumRows(); i++ ) {
1350                 if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
1351                         if ( numUnbounded != i ) {
1352                                 Swap( numUnbounded, i );
1353                         }
1354                         numUnbounded++;
1355                 }
1356         }
1357
1358         // permute input so all variables using the boxIndex come last
1359         boxStartIndex = m.GetNumRows();
1360         if ( boxIndex ) {
1361                 for ( i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
1362                         if ( boxIndex[i] >= 0 && ( lo[i] != -idMath::INFINITY || hi[i] != idMath::INFINITY ) ) {
1363                                 boxStartIndex--;
1364                                 if ( boxStartIndex != i ) {
1365                                         Swap( boxStartIndex, i );
1366                                 }
1367                         }
1368                 }
1369         }
1370
1371         // sub matrix for factorization
1372         clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) );
1373         diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
1374         solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
1375         solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
1376
1377         // all unbounded variables are clamped
1378         numClamped = numUnbounded;
1379
1380         // if there are unbounded variables
1381         if ( numUnbounded ) {
1382
1383                 // factor and solve for unbounded variables
1384                 if ( !FactorClamped() ) {
1385                         idLib::common->Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" );
1386                         return false;
1387                 }
1388                 SolveClamped( f, b.ToFloatPtr() );
1389
1390                 // if there are no bounded variables we are done
1391                 if ( numUnbounded == m.GetNumRows() ) {
1392                         o_x = f;        // the vector is not permuted
1393                         return true;
1394                 }
1395         }
1396
1397 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
1398         int numIgnored = 0;
1399 #endif
1400
1401         // allocate for delta force and delta acceleration
1402         delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
1403         delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
1404
1405         // solve for bounded variables
1406         failed = NULL;
1407         for ( i = numUnbounded; i < m.GetNumRows(); i++ ) {
1408
1409                 clampedChangeStart = 0;
1410
1411                 // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
1412                 if ( i == boxStartIndex ) {
1413                         for ( j = 0; j < boxStartIndex; j++ ) {
1414                                 o_x[permuted[j]] = f[j];
1415                         }
1416                         for ( j = boxStartIndex; j < m.GetNumRows(); j++ ) {
1417                                 s = o_x[boxIndex[j]];
1418                                 if ( lo[j] != -idMath::INFINITY ) {
1419                                         lo[j] = - idMath::Fabs( lo[j] * s );
1420                                 }
1421                                 if ( hi[j] != idMath::INFINITY ) {
1422                                         hi[j] = idMath::Fabs( hi[j] * s );
1423                                 }
1424                         }
1425                 }
1426
1427                 // calculate acceleration for current variable
1428                 SIMDProcessor->Dot( dot, rowPtrs[i], f.ToFloatPtr(), i );
1429                 a[i] = dot - b[i];
1430
1431                 // if already at the low boundary
1432                 if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
1433                         side[i] = -1;
1434                         continue;
1435                 }
1436
1437                 // if already at the high boundary
1438                 if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
1439                         side[i] = 1;
1440                         continue;
1441                 }
1442
1443                 // if inside the clamped region
1444                 if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
1445                         side[i] = 0;
1446                         AddClamped( i, false );
1447                         continue;
1448                 }
1449
1450                 // drive the current variable into a valid region
1451                 for ( n = 0; n < maxIterations; n++ ) {
1452
1453                         // direction to move
1454                         if ( a[i] <= 0.0f ) {
1455                                 dir = 1.0f;
1456                         } else {
1457                                 dir = -1.0f;
1458                         }
1459
1460                         // calculate force delta
1461                         CalcForceDelta( i, dir );
1462
1463                         // calculate acceleration delta: delta_a = m * delta_f;
1464                         CalcAccelDelta( i );
1465
1466                         // maximum step we can take
1467                         GetMaxStep( i, dir, maxStep, limit, limitSide );
1468
1469                         if ( maxStep <= 0.0f ) {
1470 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
1471                                 // ignore the current variable completely
1472                                 lo[i] = hi[i] = 0.0f;
1473                                 f[i] = 0.0f;
1474                                 side[i] = -1;
1475                                 numIgnored++;
1476 #else
1477                                 failed = va( "invalid step size %.4f", maxStep );
1478 #endif
1479                                 break;
1480                         }
1481
1482                         // change force
1483                         ChangeForce( i, maxStep );
1484
1485                         // change acceleration
1486                         ChangeAccel( i, maxStep );
1487
1488                         // clamp/unclamp the variable that limited this step
1489                         side[limit] = limitSide;
1490                         switch( limitSide ) {
1491                                 case 0: {
1492                                         a[limit] = 0.0f;
1493                                         AddClamped( limit, ( limit == i ) );
1494                                         break;
1495                                 }
1496                                 case -1: {
1497                                         f[limit] = lo[limit];
1498                                         if ( limit != i ) {
1499                                                 RemoveClamped( limit );
1500                                         }
1501                                         break;
1502                                 }
1503                                 case 1: {
1504                                         f[limit] = hi[limit];
1505                                         if ( limit != i ) {
1506                                                 RemoveClamped( limit );
1507                                         }
1508                                         break;
1509                                 }
1510                         }
1511
1512                         // if the current variable limited the step we can continue with the next variable
1513                         if ( limit == i ) {
1514                                 break;
1515                         }
1516                 }
1517
1518                 if ( n >= maxIterations ) {
1519                         failed = va( "max iterations %d", maxIterations );
1520                         break;
1521                 }
1522
1523                 if ( failed ) {
1524                         break;
1525                 }
1526         }
1527
1528 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
1529         if ( numIgnored ) {
1530                 if ( lcp_showFailures.GetBool() ) {
1531                         idLib::common->Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
1532                 }
1533         }
1534 #endif
1535
1536         // if failed clear remaining forces
1537         if ( failed ) {
1538                 if ( lcp_showFailures.GetBool() ) {
1539                         idLib::common->Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed, m.GetNumRows() - i, m.GetNumRows() - numUnbounded );
1540                 }
1541                 for ( j = i; j < m.GetNumRows(); j++ ) {
1542                         f[j] = 0.0f;
1543                 }
1544         }
1545
1546 #if defined(_DEBUG) && 0
1547         if ( !failed ) {
1548                 // test whether or not the solution satisfies the complementarity conditions
1549                 for ( i = 0; i < m.GetNumRows(); i++ ) {
1550                         a[i] = -b[i];
1551                         for ( j = 0; j < m.GetNumRows(); j++ ) {
1552                                 a[i] += rowPtrs[i][j] * f[j];
1553                         }
1554
1555                         if ( f[i] == lo[i] ) {
1556                                 if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
1557                                         int bah1 = 1;
1558                                 }
1559                         } else if ( f[i] == hi[i] ) {
1560                                 if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
1561                                         int bah2 = 1;
1562                                 }
1563                         } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
1564                                 int bah3 = 1;
1565                         }
1566                 }
1567         }
1568 #endif
1569
1570         // unpermute result
1571         for ( i = 0; i < f.GetSize(); i++ ) {
1572                 o_x[permuted[i]] = f[i];
1573         }
1574
1575         // unpermute original matrix
1576         for ( i = 0; i < m.GetNumRows(); i++ ) {
1577                 for ( j = 0; j < m.GetNumRows(); j++ ) {
1578                         if ( permuted[j] == i ) {
1579                                 break;
1580                         }
1581                 }
1582                 if ( i != j ) {
1583                         m.SwapColumns( i, j );
1584                         idSwap( permuted[i], permuted[j] );
1585                 }
1586         }
1587
1588         return true;
1589 }
1590
1591
1592 //===============================================================
1593 //
1594 //      idLCP
1595 //
1596 //===============================================================
1597
1598 /*
1599 ============
1600 idLCP::AllocSquare
1601 ============
1602 */
1603 idLCP *idLCP::AllocSquare( void ) {
1604         idLCP *lcp = new idLCP_Square;
1605         lcp->SetMaxIterations( 32 );
1606         return lcp;
1607 }
1608
1609 /*
1610 ============
1611 idLCP::AllocSymmetric
1612 ============
1613 */
1614 idLCP *idLCP::AllocSymmetric( void ) {
1615         idLCP *lcp = new idLCP_Symmetric;
1616         lcp->SetMaxIterations( 32 );
1617         return lcp;
1618 }
1619
1620 /*
1621 ============
1622 idLCP::~idLCP
1623 ============
1624 */
1625 idLCP::~idLCP( void ) {
1626 }
1627
1628 /*
1629 ============
1630 idLCP::SetMaxIterations
1631 ============
1632 */
1633 void idLCP::SetMaxIterations( int max ) {
1634         maxIterations = max;
1635 }
1636
1637 /*
1638 ============
1639 idLCP::GetMaxIterations
1640 ============
1641 */
1642 int idLCP::GetMaxIterations( void ) {
1643         return maxIterations;
1644 }