DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

Lcp.cpp (88972B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 #pragma hdrstop
     29 #include "../precompiled.h"
     30 
     31 // this file is full of intentional case fall throughs
     32 //lint -e616
     33 
     34 // the code is correct, it can't be a NULL pointer
     35 //lint -e613
     36 
     37 static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_BOOL, "show LCP solver failures" );
     38 
     39 const float LCP_BOUND_EPSILON			= 1e-5f;
     40 const float LCP_ACCEL_EPSILON			= 1e-5f;
     41 const float LCP_DELTA_ACCEL_EPSILON		= 1e-9f;
     42 const float LCP_DELTA_FORCE_EPSILON		= 1e-9f;
     43 
     44 #define IGNORE_UNSATISFIABLE_VARIABLES
     45 
     46 
     47 #if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN )
     48 
     49 ALIGN16( const __m128 SIMD_SP_zero )							= { 0.0f, 0.0f, 0.0f, 0.0f };
     50 ALIGN16( const __m128 SIMD_SP_one )								= { 1.0f, 1.0f, 1.0f, 1.0f };
     51 ALIGN16( const __m128 SIMD_SP_two )								= { 2.0f, 2.0f, 2.0f, 2.0f };
     52 ALIGN16( const __m128 SIMD_SP_tiny )							= { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
     53 ALIGN16( const __m128 SIMD_SP_infinity )						= { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
     54 ALIGN16( const __m128 SIMD_SP_LCP_DELTA_ACCEL_EPSILON )			= { LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON };
     55 ALIGN16( const __m128 SIMD_SP_LCP_DELTA_FORCE_EPSILON )			= { LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON };
     56 ALIGN16( const __m128 SIMD_SP_LCP_BOUND_EPSILON )				= { LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON };
     57 ALIGN16( const __m128 SIMD_SP_neg_LCP_BOUND_EPSILON )			= { -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON };
     58 ALIGN16( const unsigned int SIMD_SP_signBit[4] )				= { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
     59 ALIGN16( const unsigned int SIMD_SP_absMask[4] )				= { ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK };
     60 ALIGN16( const unsigned int SIMD_SP_indexedStartMask[4][4] )	= { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0, 0xFFFFFFFF } };
     61 ALIGN16( const unsigned int SIMD_SP_indexedEndMask[4][4] )		= { { 0, 0, 0, 0 }, { 0xFFFFFFFF, 0, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 } };
     62 ALIGN16( const unsigned int SIMD_SP_clearLast1[4] )				= { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
     63 ALIGN16( const unsigned int SIMD_SP_clearLast2[4] )				= { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 };
     64 ALIGN16( const unsigned int SIMD_SP_clearLast3[4] )				= { 0xFFFFFFFF, 0, 0, 0 };
     65 ALIGN16( const unsigned int SIMD_DW_zero[4] )					= { 0, 0, 0, 0 };
     66 ALIGN16( const unsigned int SIMD_DW_one[4] )					= { 1, 1, 1, 1 };
     67 ALIGN16( const unsigned int SIMD_DW_four[4] )					= { 4, 4, 4, 4 };
     68 ALIGN16( const unsigned int SIMD_DW_index[4] )					= { 0, 1, 2, 3 };
     69 ALIGN16( const int SIMD_DW_not3[4] )							= { ~3, ~3, ~3, ~3 };
     70 
     71 #endif
     72 
     73 /*
     74 ========================
     75 Multiply_SIMD
     76 
     77 dst[i] = src0[i] * src1[i];
     78 
     79 Assumes the source and destination have the same memory alignment.
     80 ========================
     81 */
     82 static void Multiply_SIMD( float * dst, const float * src0, const float * src1, const int count ) {
     83 	int i = 0;
     84 	for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
     85 		dst[i] = src0[i] * src1[i];
     86 	}
     87 
     88 #ifdef ID_WIN_X86_SSE_INTRIN
     89 
     90 	for ( ; i + 4 <= count; i += 4 ) {
     91 		assert_16_byte_aligned( &dst[i] );
     92 		assert_16_byte_aligned( &src0[i] );
     93 		assert_16_byte_aligned( &src1[i] );
     94 
     95 		__m128 s0 = _mm_load_ps( src0 + i );
     96 		__m128 s1 = _mm_load_ps( src1 + i );
     97 		s0 = _mm_mul_ps( s0, s1 );
     98 		_mm_store_ps( dst + i, s0 );
     99 	}
    100 
    101 #else
    102 
    103 	for ( ; i + 4 <= count; i += 4 ) {
    104 		assert_16_byte_aligned( &dst[i] );
    105 		assert_16_byte_aligned( &src0[i] );
    106 		assert_16_byte_aligned( &src1[i] );
    107 
    108 		dst[i+0] = src0[i+0] * src1[i+0];
    109 		dst[i+1] = src0[i+1] * src1[i+1];
    110 		dst[i+2] = src0[i+2] * src1[i+2];
    111 		dst[i+3] = src0[i+3] * src1[i+3];
    112 	}
    113 
    114 #endif
    115 
    116 	for ( ; i < count; i++ ) {
    117 		dst[i] = src0[i] * src1[i];
    118 	}
    119 }
    120 
    121 /*
    122 ========================
    123 MultiplyAdd_SIMD
    124 
    125 dst[i] += constant * src[i];
    126 
    127 Assumes the source and destination have the same memory alignment.
    128 ========================
    129 */
    130 static void MultiplyAdd_SIMD( float * dst, const float constant, const float * src, const int count ) {
    131 	int i = 0;
    132 	for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) {
    133 		dst[i] += constant * src[i];
    134 	}
    135 
    136 #ifdef ID_WIN_X86_SSE_INTRIN
    137 
    138 	__m128 c = _mm_load1_ps( & constant );
    139 	for ( ; i + 4 <= count; i += 4 ) {
    140 		assert_16_byte_aligned( &dst[i] );
    141 		assert_16_byte_aligned( &src[i] );
    142 
    143 		__m128 s = _mm_load_ps( src + i );
    144 		__m128 d = _mm_load_ps( dst + i );
    145 		s = _mm_add_ps( _mm_mul_ps( s, c ), d );
    146 		_mm_store_ps( dst + i, s );
    147 	}
    148 
    149 #else
    150 
    151 	for ( ; i + 4 <= count; i += 4 ) {
    152 		assert_16_byte_aligned( &src[i] );
    153 		assert_16_byte_aligned( &dst[i] );
    154 
    155 		dst[i+0] += constant * src[i+0];
    156 		dst[i+1] += constant * src[i+1];
    157 		dst[i+2] += constant * src[i+2];
    158 		dst[i+3] += constant * src[i+3];
    159 	}
    160 
    161 #endif
    162 
    163 	for ( ; i < count; i++ ) {
    164 		dst[i] += constant * src[i];
    165 	}
    166 }
    167 
    168 /*
    169 ========================
    170 DotProduct_SIMD
    171 
    172 dot = src0[0] * src1[0] + src0[1] * src1[1] + src0[2] * src1[2] + ...
    173 ========================
    174 */
    175 static float DotProduct_SIMD( const float * src0, const float * src1, const int count ) {
    176 	assert_16_byte_aligned( src0 );
    177 	assert_16_byte_aligned( src1 );
    178 
    179 #ifdef ID_WIN_X86_SSE_INTRIN
    180 
    181 	__m128 sum = (__m128 &) SIMD_SP_zero;
    182 	int i = 0;
    183 	for ( ; i < count - 3; i += 4 ) {
    184 		__m128 s0 = _mm_load_ps( src0 + i );
    185 		__m128 s1 = _mm_load_ps( src1 + i );
    186 		sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
    187 	}
    188 	__m128 mask = _mm_load_ps( (float *) &SIMD_SP_indexedEndMask[count & 3] );
    189 	__m128 s0 = _mm_and_ps( _mm_load_ps( src0 + i ), mask );
    190 	__m128 s1 = _mm_and_ps( _mm_load_ps( src1 + i ), mask );
    191 	sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum );
    192 	sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    193 	sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    194 	float dot;
    195 	_mm_store_ss( & dot, sum );
    196 	return dot;
    197 
    198 #else
    199 
    200 	float s0 = 0.0f;
    201 	float s1 = 0.0f;
    202 	float s2 = 0.0f;
    203 	float s3 = 0.0f;
    204 	int i = 0;
    205 	for ( ; i < count - 3; i += 4 ) {
    206 		s0 += src0[i+4] * src1[i+4];
    207 		s1 += src0[i+5] * src1[i+5];
    208 		s2 += src0[i+6] * src1[i+6];
    209 		s3 += src0[i+7] * src1[i+7];
    210 	}
    211 	switch( count - i ) {
    212 		NODEFAULT;
    213 		case 3: s0 += src0[i+2] * src1[i+2];
    214 		case 2: s1 += src0[i+1] * src1[i+1];
    215 		case 1: s2 += src0[i+0] * src1[i+0];
    216 		case 0:
    217 			break;
    218 	}
    219 	return s0 + s1 + s2 + s3;
    220 
    221 #endif
    222 }
    223 
    224 /*
    225 ========================
    226 LowerTriangularSolve_SIMD
    227 
    228 Solves x in Lx = b for the n * n sub-matrix of L.
    229 	* if skip > 0 the first skip elements of x are assumed to be valid already
    230 	* L has to be a lower triangular matrix with (implicit) ones on the diagonal
    231 	* x == b is allowed
    232 ========================
    233 */
    234 static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float * b, const int n, int skip ) {
    235 	if ( skip >= n ) {
    236 		return;
    237 	}
    238 
    239 	const float *lptr = L.ToFloatPtr();
    240 	int nc = L.GetNumColumns();
    241 
    242 	assert( ( nc & 3 ) == 0 );
    243 
    244 	// unrolled cases for n < 8
    245 	if ( n < 8 ) {
    246 		#define NSKIP( n, s )	((n<<3)|(s&7))
    247 		switch( NSKIP( n, skip ) ) {
    248 			case NSKIP( 1, 0 ): x[0] = b[0];
    249 				return;
    250 			case NSKIP( 2, 0 ): x[0] = b[0];
    251 			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    252 				return;
    253 			case NSKIP( 3, 0 ): x[0] = b[0];
    254 			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    255 			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    256 				return;
    257 			case NSKIP( 4, 0 ): x[0] = b[0];
    258 			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    259 			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    260 			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
    261 				return;
    262 			case NSKIP( 5, 0 ): x[0] = b[0];
    263 			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    264 			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    265 			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
    266 			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
    267 				return;
    268 			case NSKIP( 6, 0 ): x[0] = b[0];
    269 			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    270 			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    271 			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
    272 			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
    273 			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
    274 				return;
    275 			case NSKIP( 7, 0 ): x[0] = b[0];
    276 			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
    277 			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    278 			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
    279 			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
    280 			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
    281 			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
    282 				return;
    283 		}
    284 		#undef NSKIP
    285 		return;
    286 	}
    287 
    288 	// process first 4 rows
    289 	switch( skip ) {
    290 		case 0: x[0] = b[0];
    291 		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
    292 		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
    293 		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
    294 				skip = 4;
    295 	}
    296 
    297 	lptr = L[skip];
    298 
    299 	int i = skip;
    300 
    301 #ifdef ID_WIN_X86_SSE_INTRIN
    302 
    303 	// work up to a multiple of 4 rows
    304 	for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
    305 		__m128 sum = _mm_load_ss( & b[i] );
    306 		int j = 0;
    307 		for ( ; j < i - 3; j += 4 ) {
    308 			__m128 s0 = _mm_load_ps( lptr + j );
    309 			__m128 s1 = _mm_load_ps( x + j );
    310 			sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
    311 		}
    312 		__m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
    313 		__m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
    314 		__m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
    315 		sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
    316 		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    317 		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    318 		_mm_store_ss( & x[i], sum );
    319 		lptr += nc;
    320 	}
    321 
    322 	for ( ; i + 3 < n; i += 4 ) {
    323 		const float * lptr0 = &lptr[0*nc];
    324 		const float * lptr1 = &lptr[1*nc];
    325 		const float * lptr2 = &lptr[2*nc];
    326 		const float * lptr3 = &lptr[3*nc];
    327 
    328 		assert_16_byte_aligned( lptr0 );
    329 		assert_16_byte_aligned( lptr1 );
    330 		assert_16_byte_aligned( lptr2 );
    331 		assert_16_byte_aligned( lptr3 );
    332 
    333 		__m128 va = _mm_load_ss( & b[i+0] );
    334 		__m128 vb = _mm_load_ss( & b[i+1] );
    335 		__m128 vc = _mm_load_ss( & b[i+2] );
    336 		__m128 vd = _mm_load_ss( & b[i+3] );
    337 
    338 		__m128 x0 = _mm_load_ps( & x[0] );
    339 
    340 		va = _mm_sub_ps( va, _mm_mul_ps( x0, _mm_load_ps( lptr0 + 0 ) ) );
    341 		vb = _mm_sub_ps( vb, _mm_mul_ps( x0, _mm_load_ps( lptr1 + 0 ) ) );
    342 		vc = _mm_sub_ps( vc, _mm_mul_ps( x0, _mm_load_ps( lptr2 + 0 ) ) );
    343 		vd = _mm_sub_ps( vd, _mm_mul_ps( x0, _mm_load_ps( lptr3 + 0 ) ) );
    344 
    345 		for ( int j = 4; j < i; j += 4 ) {
    346 			__m128 xj = _mm_load_ps( &x[j] );
    347 
    348 			va = _mm_sub_ps( va, _mm_mul_ps( xj, _mm_load_ps( lptr0 + j ) ) );
    349 			vb = _mm_sub_ps( vb, _mm_mul_ps( xj, _mm_load_ps( lptr1 + j ) ) );
    350 			vc = _mm_sub_ps( vc, _mm_mul_ps( xj, _mm_load_ps( lptr2 + j ) ) );
    351 			vd = _mm_sub_ps( vd, _mm_mul_ps( xj, _mm_load_ps( lptr3 + j ) ) );
    352 		}
    353 
    354 		vb = _mm_sub_ps( vb, _mm_mul_ps( va, _mm_load1_ps( lptr1 + i + 0 ) ) );
    355 		vc = _mm_sub_ps( vc, _mm_mul_ps( va, _mm_load1_ps( lptr2 + i + 0 ) ) );
    356 		vc = _mm_sub_ps( vc, _mm_mul_ps( vb, _mm_load1_ps( lptr2 + i + 1 ) ) );
    357 		vd = _mm_sub_ps( vd, _mm_mul_ps( va, _mm_load1_ps( lptr3 + i + 0 ) ) );
    358 		vd = _mm_sub_ps( vd, _mm_mul_ps( vb, _mm_load1_ps( lptr3 + i + 1 ) ) );
    359 		vd = _mm_sub_ps( vd, _mm_mul_ps( vc, _mm_load1_ps( lptr3 + i + 2 ) ) );
    360 
    361 		__m128 ta = _mm_unpacklo_ps( va, vc );		// x0, z0, x1, z1
    362 		__m128 tb = _mm_unpackhi_ps( va, vc );		// x2, z2, x3, z3
    363 		__m128 tc = _mm_unpacklo_ps( vb, vd );		// y0, w0, y1, w1
    364 		__m128 td = _mm_unpackhi_ps( vb, vd );		// y2, w2, y3, w3
    365 
    366 		va = _mm_unpacklo_ps( ta, tc );				// x0, y0, z0, w0
    367 		vb = _mm_unpackhi_ps( ta, tc );				// x1, y1, z1, w1
    368 		vc = _mm_unpacklo_ps( tb, td );				// x2, y2, z2, w2
    369 		vd = _mm_unpackhi_ps( tb, td );				// x3, y3, z3, w3
    370 
    371 		va = _mm_add_ps( va, vb );
    372 		vc = _mm_add_ps( vc, vd );
    373 		va = _mm_add_ps( va, vc );
    374 
    375 		_mm_store_ps( & x[i], va );
    376 
    377 		lptr += 4 * nc;
    378 	}
    379 
    380 	// go through any remaining rows
    381 	for ( ; i < n; i++ ) {
    382 		__m128 sum = _mm_load_ss( & b[i] );
    383 		int j = 0;
    384 		for ( ; j < i - 3; j += 4 ) {
    385 			__m128 s0 = _mm_load_ps( lptr + j );
    386 			__m128 s1 = _mm_load_ps( x + j );
    387 			sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
    388 		}
    389 		__m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] );
    390 		__m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask );
    391 		__m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask );
    392 		sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) );
    393 		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    394 		sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    395 		_mm_store_ss( & x[i], sum );
    396 		lptr += nc;
    397 	}
    398 
    399 #else
    400 
    401 	// work up to a multiple of 4 rows
    402 	for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
    403 		float sum = b[i];
    404 		for ( int j = 0; j < i; j++ ) {
    405 			sum -= lptr[j] * x[j];
    406 		}
    407 		x[i] = sum;
    408 		lptr += nc;
    409 	}
    410 
    411 	assert_16_byte_aligned( x );
    412 
    413 	for ( ; i + 3 < n; i += 4 ) {
    414 		const float * lptr0 = &lptr[0*nc];
    415 		const float * lptr1 = &lptr[1*nc];
    416 		const float * lptr2 = &lptr[2*nc];
    417 		const float * lptr3 = &lptr[3*nc];
    418 
    419 		assert_16_byte_aligned( lptr0 );
    420 		assert_16_byte_aligned( lptr1 );
    421 		assert_16_byte_aligned( lptr2 );
    422 		assert_16_byte_aligned( lptr3 );
    423 
    424 		float a0 = - lptr0[0] * x[0] + b[i+0];
    425 		float a1 = - lptr0[1] * x[1];
    426 		float a2 = - lptr0[2] * x[2];
    427 		float a3 = - lptr0[3] * x[3];
    428 
    429 		float b0 = - lptr1[0] * x[0] + b[i+1];
    430 		float b1 = - lptr1[1] * x[1];
    431 		float b2 = - lptr1[2] * x[2];
    432 		float b3 = - lptr1[3] * x[3];
    433 
    434 		float c0 = - lptr2[0] * x[0] + b[i+2];
    435 		float c1 = - lptr2[1] * x[1];
    436 		float c2 = - lptr2[2] * x[2];
    437 		float c3 = - lptr2[3] * x[3];
    438 
    439 		float d0 = - lptr3[0] * x[0] + b[i+3];
    440 		float d1 = - lptr3[1] * x[1];
    441 		float d2 = - lptr3[2] * x[2];
    442 		float d3 = - lptr3[3] * x[3];
    443 
    444 		for ( int j = 4; j < i; j += 4 ) {
    445 			a0 -= lptr0[j+0] * x[j+0];
    446 			a1 -= lptr0[j+1] * x[j+1];
    447 			a2 -= lptr0[j+2] * x[j+2];
    448 			a3 -= lptr0[j+3] * x[j+3];
    449 
    450 			b0 -= lptr1[j+0] * x[j+0];
    451 			b1 -= lptr1[j+1] * x[j+1];
    452 			b2 -= lptr1[j+2] * x[j+2];
    453 			b3 -= lptr1[j+3] * x[j+3];
    454 
    455 			c0 -= lptr2[j+0] * x[j+0];
    456 			c1 -= lptr2[j+1] * x[j+1];
    457 			c2 -= lptr2[j+2] * x[j+2];
    458 			c3 -= lptr2[j+3] * x[j+3];
    459 
    460 			d0 -= lptr3[j+0] * x[j+0];
    461 			d1 -= lptr3[j+1] * x[j+1];
    462 			d2 -= lptr3[j+2] * x[j+2];
    463 			d3 -= lptr3[j+3] * x[j+3];
    464 		}
    465 
    466 		b0 -= lptr1[i+0] * a0;
    467 		b1 -= lptr1[i+0] * a1;
    468 		b2 -= lptr1[i+0] * a2;
    469 		b3 -= lptr1[i+0] * a3;
    470 
    471 		c0 -= lptr2[i+0] * a0;
    472 		c1 -= lptr2[i+0] * a1;
    473 		c2 -= lptr2[i+0] * a2;
    474 		c3 -= lptr2[i+0] * a3;
    475 
    476 		c0 -= lptr2[i+1] * b0;
    477 		c1 -= lptr2[i+1] * b1;
    478 		c2 -= lptr2[i+1] * b2;
    479 		c3 -= lptr2[i+1] * b3;
    480 
    481 		d0 -= lptr3[i+0] * a0;
    482 		d1 -= lptr3[i+0] * a1;
    483 		d2 -= lptr3[i+0] * a2;
    484 		d3 -= lptr3[i+0] * a3;
    485 
    486 		d0 -= lptr3[i+1] * b0;
    487 		d1 -= lptr3[i+1] * b1;
    488 		d2 -= lptr3[i+1] * b2;
    489 		d3 -= lptr3[i+1] * b3;
    490 
    491 		d0 -= lptr3[i+2] * c0;
    492 		d1 -= lptr3[i+2] * c1;
    493 		d2 -= lptr3[i+2] * c2;
    494 		d3 -= lptr3[i+2] * c3;
    495 
    496 		x[i+0] = a0 + a1 + a2 + a3;
    497 		x[i+1] = b0 + b1 + b2 + b3;
    498 		x[i+2] = c0 + c1 + c2 + c3;
    499 		x[i+3] = d0 + d1 + d2 + d3;
    500 
    501 		lptr += 4 * nc;
    502 	}
    503 
    504 	// go through any remaining rows
    505 	for ( ; i < n; i++ ) {
    506 		float sum = b[i];
    507 		for ( int j = 0; j < i; j++ ) {
    508 			sum -= lptr[j] * x[j];
    509 		}
    510 		x[i] = sum;
    511 		lptr += nc;
    512 	}
    513 
    514 #endif
    515 
    516 }
    517 
    518 /*
    519 ========================
    520 LowerTriangularSolveTranspose_SIMD
    521 
    522 Solves x in L'x = b for the n * n sub-matrix of L.
    523 	* L has to be a lower triangular matrix with (implicit) ones on the diagonal
    524 	* x == b is allowed
    525 ========================
    526 */
    527 static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, const float * b, const int n ) {
    528 	int nc = L.GetNumColumns();
    529 
    530 	assert( ( nc & 3 ) == 0 );
    531 
    532 	int m = n;
    533 	int r = n & 3;
    534 
    535 	if ( ( m & 3 ) != 0 ) {
    536 		const float * lptr = L.ToFloatPtr() + m * nc + m;
    537 		if ( ( m & 3 ) == 1 ) {
    538 			x[m-1] = b[m-1];
    539 			m -= 1;
    540 		} else if ( ( m & 3 ) == 2 ) {
    541 			x[m-1] = b[m-1];
    542 			x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
    543 			m -= 2;
    544 		} else {
    545 			x[m-1] = b[m-1];
    546 			x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1];
    547 			x[m-3] = b[m-3] - lptr[-1*nc-3] * x[m-1] - lptr[-2*nc-3] * x[m-2];
    548 			m -= 3;
    549 		}
    550 	}
    551 
    552 	const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
    553 	float * xptr = x + m;
    554 
    555 #ifdef ID_WIN_X86_SSE2_INTRIN
    556 
    557 	// process 4 rows at a time
    558 	for ( int i = m; i >= 4; i -= 4 ) {
    559 		assert_16_byte_aligned( b );
    560 		assert_16_byte_aligned( xptr );
    561 		assert_16_byte_aligned( lptr );
    562 
    563 		__m128 s0 = _mm_load_ps( &b[i-4] );
    564 		__m128 s1 = (__m128 &)SIMD_SP_zero;
    565 		__m128 s2 = (__m128 &)SIMD_SP_zero;
    566 		__m128 s3 = (__m128 &)SIMD_SP_zero;
    567 
    568 		// process 4x4 blocks
    569 		const float * xptr2 = xptr;	// x + i;
    570 		const float * lptr2 = lptr;	// ptr = L[i] + i - 4;
    571 		for ( int j = i; j < m; j += 4 ) {
    572 			__m128 xj = _mm_load_ps( xptr2 );
    573 
    574 			s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_splat_ps( xj, 0 ), _mm_load_ps( lptr2 + 0 * nc ) ) );
    575 			s1 = _mm_sub_ps( s1, _mm_mul_ps( _mm_splat_ps( xj, 1 ), _mm_load_ps( lptr2 + 1 * nc ) ) );
    576 			s2 = _mm_sub_ps( s2, _mm_mul_ps( _mm_splat_ps( xj, 2 ), _mm_load_ps( lptr2 + 2 * nc ) ) );
    577 			s3 = _mm_sub_ps( s3, _mm_mul_ps( _mm_splat_ps( xj, 3 ), _mm_load_ps( lptr2 + 3 * nc ) ) );
    578 
    579 			lptr2 += 4 * nc;
    580 			xptr2 += 4;
    581 		}
    582 		for ( int j = 0; j < r; j++ ) {
    583 			s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_load_ps( lptr2 ), _mm_load1_ps( &xptr2[j] ) ) );
    584 			lptr2 += nc;
    585 		}
    586 		s0 = _mm_add_ps( s0, s1 );
    587 		s2 = _mm_add_ps( s2, s3 );
    588 		s0 = _mm_add_ps( s0, s2 );
    589 		// process left over of the 4 rows
    590 		lptr -= 4 * nc;
    591 		__m128 t0 = _mm_and_ps( _mm_load_ps( lptr + 3 * nc ), (__m128 &)SIMD_SP_clearLast1 );
    592 		__m128 t1 = _mm_and_ps( _mm_load_ps( lptr + 2 * nc ), (__m128 &)SIMD_SP_clearLast2 );
    593 		__m128 t2 = _mm_load_ss( lptr + 1 * nc );
    594 		s0 = _mm_sub_ps( s0, _mm_mul_ps( t0, _mm_splat_ps( s0, 3 ) ) );
    595 		s0 = _mm_sub_ps( s0, _mm_mul_ps( t1, _mm_splat_ps( s0, 2 ) ) );
    596 		s0 = _mm_sub_ps( s0, _mm_mul_ps( t2, _mm_splat_ps( s0, 1 ) ) );
    597 		// store result
    598 		_mm_store_ps( &xptr[-4], s0 );
    599 		// update pointers for next four rows
    600 		lptr -= 4;
    601 		xptr -= 4;
    602 	}
    603 
    604 #else
    605 
    606 	// process 4 rows at a time
    607 	for ( int i = m; i >= 4; i -= 4 ) {
    608 		assert_16_byte_aligned( b );
    609 		assert_16_byte_aligned( xptr );
    610 		assert_16_byte_aligned( lptr );
    611 
    612 		float s0 = b[i-4];
    613 		float s1 = b[i-3];
    614 		float s2 = b[i-2];
    615 		float s3 = b[i-1];
    616 		// process 4x4 blocks
    617 		const float * xptr2 = xptr;	// x + i;
    618 		const float * lptr2 = lptr;	// ptr = L[i] + i - 4;
    619 		for ( int j = i; j < m; j += 4 ) {
    620 			float t0 = xptr2[0];
    621 			s0 -= lptr2[0] * t0;
    622 			s1 -= lptr2[1] * t0;
    623 			s2 -= lptr2[2] * t0;
    624 			s3 -= lptr2[3] * t0;
    625 			lptr2 += nc;
    626 			float t1 = xptr2[1];
    627 			s0 -= lptr2[0] * t1;
    628 			s1 -= lptr2[1] * t1;
    629 			s2 -= lptr2[2] * t1;
    630 			s3 -= lptr2[3] * t1;
    631 			lptr2 += nc;
    632 			float t2 = xptr2[2];
    633 			s0 -= lptr2[0] * t2;
    634 			s1 -= lptr2[1] * t2;
    635 			s2 -= lptr2[2] * t2;
    636 			s3 -= lptr2[3] * t2;
    637 			lptr2 += nc;
    638 			float t3 = xptr2[3];
    639 			s0 -= lptr2[0] * t3;
    640 			s1 -= lptr2[1] * t3;
    641 			s2 -= lptr2[2] * t3;
    642 			s3 -= lptr2[3] * t3;
    643 			lptr2 += nc;
    644 			xptr2 += 4;
    645 		}
    646 		for ( int j = 0; j < r; j++ ) {
    647 			float t = xptr2[j];
    648 			s0 -= lptr2[0] * t;
    649 			s1 -= lptr2[1] * t;
    650 			s2 -= lptr2[2] * t;
    651 			s3 -= lptr2[3] * t;
    652 			lptr2 += nc;
    653 		}
    654 		// process left over of the 4 rows
    655 		lptr -= nc;
    656 		s0 -= lptr[0] * s3;
    657 		s1 -= lptr[1] * s3;
    658 		s2 -= lptr[2] * s3;
    659 		lptr -= nc;
    660 		s0 -= lptr[0] * s2;
    661 		s1 -= lptr[1] * s2;
    662 		lptr -= nc;
    663 		s0 -= lptr[0] * s1;
    664 		lptr -= nc;
    665 		// store result
    666 		xptr[-4] = s0;
    667 		xptr[-3] = s1;
    668 		xptr[-2] = s2;
    669 		xptr[-1] = s3;
    670 		// update pointers for next four rows
    671 		lptr -= 4;
    672 		xptr -= 4;
    673 	}
    674 
    675 #endif
    676 
    677 }
    678 
    679 /*
    680 ========================
    681 UpperTriangularSolve_SIMD
    682 
    683 Solves x in Ux = b for the n * n sub-matrix of U.
    684 	* U has to be a upper triangular matrix
    685 	* invDiag is the reciprical of the diagonal of the upper triangular matrix.
    686 	* x == b is allowed
    687 ========================
    688 */
    689 static void UpperTriangularSolve_SIMD( const idMatX & U, const float * invDiag, float * x, const float * b, const int n ) {
    690 	for ( int i = n - 1; i >= 0; i-- ) {
    691 		float sum = b[i];
    692 		const float * uptr = U[i];
    693 		for ( int j = i + 1; j < n; j++ ) {
    694 			sum -= uptr[j] * x[j];
    695 		}
    696 		x[i] = sum * invDiag[i];
    697 	}
    698 }
    699 
    700 /*
    701 ========================
    702 LU_Factor_SIMD
    703 
    704 In-place factorization LU of the n * n sub-matrix of mat. The reciprocal of the diagonal 
    705 elements of U are stored in invDiag. No pivoting is used.
    706 ========================
    707 */
    708 static bool LU_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
    709 	for ( int i = 0; i < n; i++ ) {
    710 
    711 		float d1 = mat[i][i];
    712 
    713 		if ( fabs( d1 ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
    714 			return false;
    715 		}
    716 
    717 		invDiag[i] = d1 = 1.0f / d1;
    718 
    719 		float * ptr1 = mat[i];
    720 
    721 		for ( int j = i + 1; j < n; j++ ) {
    722 
    723 			float * ptr2 = mat[j];
    724 			float d2 = ptr2[i] * d1;
    725 			ptr2[i] = d2;
    726 
    727 			int k;
    728 			for ( k = n - 1; k > i + 15; k -= 16 ) {
    729 				ptr2[k-0] -= d2 * ptr1[k-0];
    730 				ptr2[k-1] -= d2 * ptr1[k-1];
    731 				ptr2[k-2] -= d2 * ptr1[k-2];
    732 				ptr2[k-3] -= d2 * ptr1[k-3];
    733 				ptr2[k-4] -= d2 * ptr1[k-4];
    734 				ptr2[k-5] -= d2 * ptr1[k-5];
    735 				ptr2[k-6] -= d2 * ptr1[k-6];
    736 				ptr2[k-7] -= d2 * ptr1[k-7];
    737 				ptr2[k-8] -= d2 * ptr1[k-8];
    738 				ptr2[k-9] -= d2 * ptr1[k-9];
    739 				ptr2[k-10] -= d2 * ptr1[k-10];
    740 				ptr2[k-11] -= d2 * ptr1[k-11];
    741 				ptr2[k-12] -= d2 * ptr1[k-12];
    742 				ptr2[k-13] -= d2 * ptr1[k-13];
    743 				ptr2[k-14] -= d2 * ptr1[k-14];
    744 				ptr2[k-15] -= d2 * ptr1[k-15];
    745 			}
    746 			switch( k - i ) {
    747 				NODEFAULT;
    748 				case 15: ptr2[k-14] -= d2 * ptr1[k-14];
    749 				case 14: ptr2[k-13] -= d2 * ptr1[k-13];
    750 				case 13: ptr2[k-12] -= d2 * ptr1[k-12];
    751 				case 12: ptr2[k-11] -= d2 * ptr1[k-11];
    752 				case 11: ptr2[k-10] -= d2 * ptr1[k-10];
    753 				case 10: ptr2[k-9] -= d2 * ptr1[k-9];
    754 				case 9: ptr2[k-8] -= d2 * ptr1[k-8];
    755 				case 8: ptr2[k-7] -= d2 * ptr1[k-7];
    756 				case 7: ptr2[k-6] -= d2 * ptr1[k-6];
    757 				case 6: ptr2[k-5] -= d2 * ptr1[k-5];
    758 				case 5: ptr2[k-4] -= d2 * ptr1[k-4];
    759 				case 4: ptr2[k-3] -= d2 * ptr1[k-3];
    760 				case 3: ptr2[k-2] -= d2 * ptr1[k-2];
    761 				case 2: ptr2[k-1] -= d2 * ptr1[k-1];
    762 				case 1: ptr2[k-0] -= d2 * ptr1[k-0];
    763 				case 0: break;
    764 			}
    765 		}
    766 	}
    767 
    768 	return true;
    769 }
    770 
    771 /*
    772 ========================
    773 LDLT_Factor_SIMD
    774 
    775 In-place factorization LDL' of the n * n sub-matrix of mat. The reciprocal of the diagonal 
    776 elements are stored in invDiag.
    777 
    778 NOTE:	The number of columns of mat must be a multiple of 4.
    779 ========================
    780 */
    781 static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
    782 	float s0, s1, s2, d;
    783 
    784 	float * v = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
    785 	float * diag = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) );
    786 	float * invDiagPtr = invDiag.ToFloatPtr();
    787 
    788 	int nc = mat.GetNumColumns();
    789 
    790 	assert( ( nc & 3 ) == 0 );
    791 
    792 	if ( n <= 0 ) {
    793 		return true;
    794 	}
    795 
    796 	float * mptr = mat[0];
    797 
    798 	float sum = mptr[0];
    799 
    800 	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
    801 		return false;
    802 	}
    803 
    804 	diag[0] = sum;
    805 	invDiagPtr[0] = d = 1.0f / sum;
    806 
    807 	if ( n <= 1 ) {
    808 		return true;
    809 	}
    810 
    811 	mptr = mat[0];
    812 	for ( int j = 1; j < n; j++ ) {
    813 		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
    814 	}
    815 
    816 	mptr = mat[1];
    817 
    818 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    819 	sum = mptr[1] - s0;
    820 
    821 	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
    822 		return false;
    823 	}
    824 
    825 	mat[1][1] = sum;
    826 	diag[1] = sum;
    827 	invDiagPtr[1] = d = 1.0f / sum;
    828 
    829 	if ( n <= 2 ) {
    830 		return true;
    831 	}
    832 
    833 	mptr = mat[0];
    834 	for ( int j = 2; j < n; j++ ) {
    835 		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
    836 	}
    837 
    838 	mptr = mat[2];
    839 
    840 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    841 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    842 	sum = mptr[2] - s0 - s1;
    843 
    844 	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
    845 		return false;
    846 	}
    847 
    848 	mat[2][2] = sum;
    849 	diag[2] = sum;
    850 	invDiagPtr[2] = d = 1.0f / sum;
    851 
    852 	if ( n <= 3 ) {
    853 		return true;
    854 	}
    855 
    856 	mptr = mat[0];
    857 	for ( int j = 3; j < n; j++ ) {
    858 		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
    859 	}
    860 
    861 	mptr = mat[3];
    862 
    863 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
    864 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
    865 	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
    866 	sum = mptr[3] - s0 - s1 - s2;
    867 
    868 	if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
    869 		return false;
    870 	}
    871 
    872 	mat[3][3] = sum;
    873 	diag[3] = sum;
    874 	invDiagPtr[3] = d = 1.0f / sum;
    875 
    876 	if ( n <= 4 ) {
    877 		return true;
    878 	}
    879 
    880 	mptr = mat[0];
    881 	for ( int j = 4; j < n; j++ ) {
    882 		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
    883 	}
    884 
    885 #ifdef ID_WIN_X86_SSE2_INTRIN
    886 
    887 	__m128 vzero = _mm_setzero_ps();
    888 	for ( int i = 4; i < n; i += 4 ) {
    889 		_mm_store_ps( diag + i, vzero );
    890 	}
    891 
    892 	for ( int i = 4; i < n; i++ ) {
    893 		mptr = mat[i];
    894 
    895 		assert_16_byte_aligned( v );
    896 		assert_16_byte_aligned( mptr );
    897 		assert_16_byte_aligned( diag );
    898 
    899 		__m128 m0 = _mm_load_ps( mptr + 0 );
    900 		__m128 d0 = _mm_load_ps( diag + 0 );
    901 		__m128 v0 = _mm_mul_ps( d0, m0 );
    902 		__m128 t0 = _mm_load_ss( mptr + i );
    903 		t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
    904 		_mm_store_ps( v + 0, v0 );
    905 
    906 		int k = 4;
    907 		for ( ; k < i - 3; k += 4 ) {
    908 			m0 = _mm_load_ps( mptr + k );
    909 			d0 = _mm_load_ps( diag + k );
    910 			v0 = _mm_mul_ps( d0, m0 );
    911 			t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
    912 			_mm_store_ps( v + k, v0 );
    913 		}
    914 
    915 		__m128 mask = (__m128 &) SIMD_SP_indexedEndMask[i & 3];
    916 
    917 		m0 = _mm_and_ps( _mm_load_ps( mptr + k ), mask );
    918 		d0 = _mm_load_ps( diag + k );
    919 		v0 = _mm_mul_ps( d0, m0 );
    920 		t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) );
    921 		_mm_store_ps( v + k, v0 );
    922 
    923 		t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    924 		t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    925 
    926 		__m128 tiny = _mm_and_ps( _mm_cmpeq_ps( t0, SIMD_SP_zero ), SIMD_SP_tiny );
    927 		t0 = _mm_or_ps( t0, tiny );
    928 
    929 		_mm_store_ss( mptr + i, t0 );
    930 		_mm_store_ss( diag + i, t0 );
    931 
    932 		__m128 d = _mm_rcp32_ps( t0 );
    933 		_mm_store_ss( invDiagPtr + i, d );
    934 
    935 		if ( i + 1 >= n ) {
    936 			return true;
    937 		}
    938 
    939 		int j = i + 1;
    940 		for ( ; j < n - 3; j += 4 ) {
    941 			float * ra = mat[j+0];
    942 			float * rb = mat[j+1];
    943 			float * rc = mat[j+2];
    944 			float * rd = mat[j+3];
    945 
    946 			assert_16_byte_aligned( v );
    947 			assert_16_byte_aligned( ra );
    948 			assert_16_byte_aligned( rb );
    949 			assert_16_byte_aligned( rc );
    950 			assert_16_byte_aligned( rd );
    951 
    952 			__m128 va = _mm_load_ss( ra + i );
    953 			__m128 vb = _mm_load_ss( rb + i );
    954 			__m128 vc = _mm_load_ss( rc + i );
    955 			__m128 vd = _mm_load_ss( rd + i );
    956 
    957 			__m128 v0 = _mm_load_ps( v + 0 );
    958 
    959 			va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + 0 ), v0 ) );
    960 			vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + 0 ), v0 ) );
    961 			vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + 0 ), v0 ) );
    962 			vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + 0 ), v0 ) );
    963 
    964 			int k = 4;
    965 			for ( ; k < i - 3; k += 4 ) {
    966 				v0 = _mm_load_ps( v + k );
    967 
    968 				va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + k ), v0 ) );
    969 				vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + k ), v0 ) );
    970 				vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + k ), v0 ) );
    971 				vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + k ), v0 ) );
    972 			}
    973 
    974 			v0 = _mm_load_ps( v + k );
    975 
    976 			va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( ra + k ), mask ), v0 ) );
    977 			vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rb + k ), mask ), v0 ) );
    978 			vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rc + k ), mask ), v0 ) );
    979 			vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rd + k ), mask ), v0 ) );
    980 
    981 			__m128 ta = _mm_unpacklo_ps( va, vc );		// x0, z0, x1, z1
    982 			__m128 tb = _mm_unpackhi_ps( va, vc );		// x2, z2, x3, z3
    983 			__m128 tc = _mm_unpacklo_ps( vb, vd );		// y0, w0, y1, w1
    984 			__m128 td = _mm_unpackhi_ps( vb, vd );		// y2, w2, y3, w3
    985 
    986 			va = _mm_unpacklo_ps( ta, tc );				// x0, y0, z0, w0
    987 			vb = _mm_unpackhi_ps( ta, tc );				// x1, y1, z1, w1
    988 			vc = _mm_unpacklo_ps( tb, td );				// x2, y2, z2, w2
    989 			vd = _mm_unpackhi_ps( tb, td );				// x3, y3, z3, w3
    990 
    991 			va = _mm_add_ps( va, vb );
    992 			vc = _mm_add_ps( vc, vd );
    993 			va = _mm_add_ps( va, vc );
    994 			va = _mm_mul_ps( va, d );
    995 
    996 			_mm_store_ss( ra + i, _mm_splat_ps( va, 0 ) );
    997 			_mm_store_ss( rb + i, _mm_splat_ps( va, 1 ) );
    998 			_mm_store_ss( rc + i, _mm_splat_ps( va, 2 ) );
    999 			_mm_store_ss( rd + i, _mm_splat_ps( va, 3 ) );
   1000 		}
   1001 		for ( ; j < n; j++ ) {
   1002 			float * mptr = mat[j];
   1003 
   1004 			assert_16_byte_aligned( v );
   1005 			assert_16_byte_aligned( mptr );
   1006 
   1007 			__m128 va = _mm_load_ss( mptr + i );
   1008 			__m128 v0 = _mm_load_ps( v + 0 );
   1009 
   1010 			va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + 0 ), v0 ) );
   1011 
   1012 			int k = 4;
   1013 			for ( ; k < i - 3; k += 4 ) {
   1014 				v0 = _mm_load_ps( v + k );
   1015 				va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + k ), v0 ) );
   1016 			}
   1017 
   1018 			v0 = _mm_load_ps( v + k );
   1019 			va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( mptr + k ), mask ), v0 ) );
   1020 
   1021 			va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   1022 			va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   1023 			va = _mm_mul_ps( va, d );
   1024 
   1025 			_mm_store_ss( mptr + i, va );
   1026 		}
   1027 	}
   1028 	return true;
   1029 
   1030 #else
   1031 
   1032 	for ( int i = 4; i < n; i += 4 ) {
   1033 		diag[i+0] = 0.0f;
   1034 		diag[i+1] = 0.0f;
   1035 		diag[i+2] = 0.0f;
   1036 		diag[i+3] = 0.0f;
   1037 	}
   1038 
   1039 	for ( int i = 4; i < n; i++ ) {
   1040 		mptr = mat[i];
   1041 
   1042 		assert_16_byte_aligned( v );
   1043 		assert_16_byte_aligned( mptr );
   1044 		assert_16_byte_aligned( diag );
   1045 
   1046 		v[0] = diag[0] * mptr[0];
   1047 		v[1] = diag[1] * mptr[1];
   1048 		v[2] = diag[2] * mptr[2];
   1049 		v[3] = diag[3] * mptr[3];
   1050 
   1051 		float t0 = - mptr[0] * v[0] + mptr[i];
   1052 		float t1 = - mptr[1] * v[1];
   1053 		float t2 = - mptr[2] * v[2];
   1054 		float t3 = - mptr[3] * v[3];
   1055 
   1056 		int k = 4;
   1057 		for ( ; k < i - 3; k += 4 ) {
   1058 			v[k+0] = diag[k+0] * mptr[k+0];
   1059 			v[k+1] = diag[k+1] * mptr[k+1];
   1060 			v[k+2] = diag[k+2] * mptr[k+2];
   1061 			v[k+3] = diag[k+3] * mptr[k+3];
   1062 
   1063 			t0 -= mptr[k+0] * v[k+0];
   1064 			t1 -= mptr[k+1] * v[k+1];
   1065 			t2 -= mptr[k+2] * v[k+2];
   1066 			t3 -= mptr[k+3] * v[k+3];
   1067 		}
   1068 
   1069 		float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
   1070 		float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
   1071 		float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
   1072 		float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
   1073 
   1074 		v[k+0] = diag[k+0] * m0;
   1075 		v[k+1] = diag[k+1] * m1;
   1076 		v[k+2] = diag[k+2] * m2;
   1077 		v[k+3] = diag[k+3] * m3;
   1078 
   1079 		t0 -= m0 * v[k+0];
   1080 		t1 -= m1 * v[k+1];
   1081 		t2 -= m2 * v[k+2];
   1082 		t3 -= m3 * v[k+3];
   1083 
   1084 		sum = t0 + t1 + t2 + t3;
   1085 
   1086 		if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   1087 			return false;
   1088 		}
   1089 
   1090 		mat[i][i] = sum;
   1091 		diag[i] = sum;
   1092 		invDiagPtr[i] = d = 1.0f / sum;
   1093 
   1094 		if ( i + 1 >= n ) {
   1095 			return true;
   1096 		}
   1097 
   1098 		int j = i + 1;
   1099 		for ( ; j < n - 3; j += 4 ) {
   1100 			float * ra = mat[j+0];
   1101 			float * rb = mat[j+1];
   1102 			float * rc = mat[j+2];
   1103 			float * rd = mat[j+3];
   1104 
   1105 			assert_16_byte_aligned( v );
   1106 			assert_16_byte_aligned( ra );
   1107 			assert_16_byte_aligned( rb );
   1108 			assert_16_byte_aligned( rc );
   1109 			assert_16_byte_aligned( rd );
   1110 
   1111 			float a0 = - ra[0] * v[0] + ra[i];
   1112 			float a1 = - ra[1] * v[1];
   1113 			float a2 = - ra[2] * v[2];
   1114 			float a3 = - ra[3] * v[3];
   1115 
   1116 			float b0 = - rb[0] * v[0] + rb[i];
   1117 			float b1 = - rb[1] * v[1];
   1118 			float b2 = - rb[2] * v[2];
   1119 			float b3 = - rb[3] * v[3];
   1120 
   1121 			float c0 = - rc[0] * v[0] + rc[i];
   1122 			float c1 = - rc[1] * v[1];
   1123 			float c2 = - rc[2] * v[2];
   1124 			float c3 = - rc[3] * v[3];
   1125 
   1126 			float d0 = - rd[0] * v[0] + rd[i];
   1127 			float d1 = - rd[1] * v[1];
   1128 			float d2 = - rd[2] * v[2];
   1129 			float d3 = - rd[3] * v[3];
   1130 
   1131 			int k = 4;
   1132 			for ( ; k < i - 3; k += 4 ) {
   1133 				a0 -= ra[k+0] * v[k+0];
   1134 				a1 -= ra[k+1] * v[k+1];
   1135 				a2 -= ra[k+2] * v[k+2];
   1136 				a3 -= ra[k+3] * v[k+3];
   1137 
   1138 				b0 -= rb[k+0] * v[k+0];
   1139 				b1 -= rb[k+1] * v[k+1];
   1140 				b2 -= rb[k+2] * v[k+2];
   1141 				b3 -= rb[k+3] * v[k+3];
   1142 
   1143 				c0 -= rc[k+0] * v[k+0];
   1144 				c1 -= rc[k+1] * v[k+1];
   1145 				c2 -= rc[k+2] * v[k+2];
   1146 				c3 -= rc[k+3] * v[k+3];
   1147 
   1148 				d0 -= rd[k+0] * v[k+0];
   1149 				d1 -= rd[k+1] * v[k+1];
   1150 				d2 -= rd[k+2] * v[k+2];
   1151 				d3 -= rd[k+3] * v[k+3];
   1152 			}
   1153 
   1154 			float ra0 = ( i - k > 0 ) ? ra[k+0] : 0.0f;
   1155 			float ra1 = ( i - k > 1 ) ? ra[k+1] : 0.0f;
   1156 			float ra2 = ( i - k > 2 ) ? ra[k+2] : 0.0f;
   1157 			float ra3 = ( i - k > 3 ) ? ra[k+3] : 0.0f;
   1158 
   1159 			float rb0 = ( i - k > 0 ) ? rb[k+0] : 0.0f;
   1160 			float rb1 = ( i - k > 1 ) ? rb[k+1] : 0.0f;
   1161 			float rb2 = ( i - k > 2 ) ? rb[k+2] : 0.0f;
   1162 			float rb3 = ( i - k > 3 ) ? rb[k+3] : 0.0f;
   1163 
   1164 			float rc0 = ( i - k > 0 ) ? rc[k+0] : 0.0f;
   1165 			float rc1 = ( i - k > 1 ) ? rc[k+1] : 0.0f;
   1166 			float rc2 = ( i - k > 2 ) ? rc[k+2] : 0.0f;
   1167 			float rc3 = ( i - k > 3 ) ? rc[k+3] : 0.0f;
   1168 
   1169 			float rd0 = ( i - k > 0 ) ? rd[k+0] : 0.0f;
   1170 			float rd1 = ( i - k > 1 ) ? rd[k+1] : 0.0f;
   1171 			float rd2 = ( i - k > 2 ) ? rd[k+2] : 0.0f;
   1172 			float rd3 = ( i - k > 3 ) ? rd[k+3] : 0.0f;
   1173 
   1174 			a0 -= ra0 * v[k+0];
   1175 			a1 -= ra1 * v[k+1];
   1176 			a2 -= ra2 * v[k+2];
   1177 			a3 -= ra3 * v[k+3];
   1178 
   1179 			b0 -= rb0 * v[k+0];
   1180 			b1 -= rb1 * v[k+1];
   1181 			b2 -= rb2 * v[k+2];
   1182 			b3 -= rb3 * v[k+3];
   1183 
   1184 			c0 -= rc0 * v[k+0];
   1185 			c1 -= rc1 * v[k+1];
   1186 			c2 -= rc2 * v[k+2];
   1187 			c3 -= rc3 * v[k+3];
   1188 
   1189 			d0 -= rd0 * v[k+0];
   1190 			d1 -= rd1 * v[k+1];
   1191 			d2 -= rd2 * v[k+2];
   1192 			d3 -= rd3 * v[k+3];
   1193 
   1194 			ra[i] = ( a0 + a1 + a2 + a3 ) * d;
   1195 			rb[i] = ( b0 + b1 + b2 + b3 ) * d;
   1196 			rc[i] = ( c0 + c1 + c2 + c3 ) * d;
   1197 			rd[i] = ( d0 + d1 + d2 + d3 ) * d;
   1198 		}
   1199 		for ( ; j < n; j++ ) {
   1200 			mptr = mat[j];
   1201 
   1202 			assert_16_byte_aligned( v );
   1203 			assert_16_byte_aligned( mptr );
   1204 
   1205 			float a0 = - mptr[0] * v[0] + mptr[i];
   1206 			float a1 = - mptr[1] * v[1];
   1207 			float a2 = - mptr[2] * v[2];
   1208 			float a3 = - mptr[3] * v[3];
   1209 
   1210 			int k = 4;
   1211 			for ( ; k < i - 3; k += 4 ) {
   1212 				a0 -= mptr[k+0] * v[k+0];
   1213 				a1 -= mptr[k+1] * v[k+1];
   1214 				a2 -= mptr[k+2] * v[k+2];
   1215 				a3 -= mptr[k+3] * v[k+3];
   1216 			}
   1217 
   1218 			float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f;
   1219 			float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f;
   1220 			float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f;
   1221 			float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f;
   1222 
   1223 			a0 -= m0 * v[k+0];
   1224 			a1 -= m1 * v[k+1];
   1225 			a2 -= m2 * v[k+2];
   1226 			a3 -= m3 * v[k+3];
   1227 
   1228 			mptr[i] = ( a0 + a1 + a2 + a3 ) * d;
   1229 		}
   1230 	}
   1231 	return true;
   1232 
   1233 #endif
   1234 }
   1235 
   1236 /*
   1237 ========================
   1238 GetMaxStep_SIMD
   1239 ========================
   1240 */
   1241 static void GetMaxStep_SIMD( const float * f, const float * a, const float * delta_f, const float * delta_a,
   1242 							const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
   1243 							int d, float dir, float & maxStep, int & limit, int & limitSide ) {
   1244 
   1245 #ifdef ID_WIN_X86_SSE2_INTRIN
   1246 
   1247 	__m128 vMaxStep;
   1248 	__m128i vLimit;
   1249 	__m128i vLimitSide;
   1250 
   1251 	// default to a full step for the current variable
   1252 	{
   1253 		__m128 vNegAccel = _mm_xor_ps( _mm_load1_ps( a + d ), (__m128 &) SIMD_SP_signBit );
   1254 		__m128 vDeltaAccel = _mm_load1_ps( delta_a + d );
   1255 		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaAccel, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
   1256 		__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
   1257 		vMaxStep = _mm_sel_ps( SIMD_SP_zero, vStep, vM0 );
   1258 		vLimit = _mm_shuffle_epi32( _mm_cvtsi32_si128( d ), 0 );
   1259 		vLimitSide = (__m128i &) SIMD_DW_zero;
   1260 	}
   1261 
   1262 	// test the current variable
   1263 	{
   1264 		__m128 vDeltaForce = _mm_load1_ps( & dir );
   1265 		__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
   1266 		__m128 vForceLimit = _mm_sel_ps( _mm_load1_ps( hi + d ), _mm_load1_ps( lo + d ), vSign );
   1267 		__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load1_ps( f + d ) ), vDeltaForce );
   1268 		__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
   1269 		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
   1270 		__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
   1271 		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
   1272 		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
   1273 		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
   1274 		vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
   1275 	}
   1276 
   1277 	// test the clamped bounded variables
   1278 	{
   1279 		__m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numUnbounded & 3];
   1280 		__m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numUnbounded ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
   1281 		int i = numUnbounded & ~3;
   1282 		for ( ; i < numClamped - 3; i += 4 ) {
   1283 			__m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), mask );
   1284 			__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
   1285 			__m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
   1286 			__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
   1287 			__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
   1288 			__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
   1289 			__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
   1290 			__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
   1291 			__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
   1292 			vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
   1293 			vLimit = _mm_sel_si128( vLimit, index, vM3 );
   1294 			vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
   1295 			mask = (__m128 &) SIMD_SP_indexedStartMask[0];
   1296 			index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
   1297 		}
   1298 		__m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[numClamped & 3] ) );
   1299 		__m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero );
   1300 		__m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign );
   1301 		__m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON );
   1302 		__m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) );
   1303 		__m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one );
   1304 		__m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity );
   1305 		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
   1306 		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
   1307 		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
   1308 		vLimit = _mm_sel_si128( vLimit, index, vM3 );
   1309 		vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) );
   1310 	}
   1311 
   1312 	// test the not clamped bounded variables
   1313 	{
   1314 		__m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numClamped & 3];
   1315 		__m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numClamped ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index );
   1316 		int i = numClamped & ~3;
   1317 		for ( ; i < d - 3; i += 4 ) {
   1318 			__m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
   1319 			__m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), mask );
   1320 			__m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
   1321 			__m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
   1322 			__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
   1323 			__m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
   1324 			__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
   1325 			__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
   1326 			vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
   1327 			vLimit = _mm_sel_si128( vLimit, index, vM3 );
   1328 			vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
   1329 			mask = (__m128 &) SIMD_SP_indexedStartMask[0];
   1330 			index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four );
   1331 		}
   1332 		__m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit );
   1333 		__m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[d & 3] ) );
   1334 		__m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) );
   1335 		__m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON );
   1336 		__m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) );
   1337 		__m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) );
   1338 		__m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep );
   1339 		__m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 );
   1340 		vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 );
   1341 		vLimit = _mm_sel_si128( vLimit, index, vM3 );
   1342 		vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) );
   1343 	}
   1344 
   1345 	{
   1346 		__m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1347 		__m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1348 		__m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1349 		__m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
   1350 		vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
   1351 		vLimit = _mm_sel_si128( vLimit, tLimit, mask );
   1352 		vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
   1353 	}
   1354 
   1355 	{
   1356 		__m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   1357 		__m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   1358 		__m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   1359 		__m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep );
   1360 		vMaxStep = _mm_min_ps( vMaxStep, tMaxStep );
   1361 		vLimit = _mm_sel_si128( vLimit, tLimit, mask );
   1362 		vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask );
   1363 	}
   1364 
   1365 	_mm_store_ss( & maxStep, vMaxStep );
   1366 	limit = _mm_cvtsi128_si32( vLimit );
   1367 	limitSide = _mm_cvtsi128_si32( vLimitSide );
   1368 
   1369 #else
   1370 
   1371 	// default to a full step for the current variable
   1372 	{
   1373 		float negAccel = -a[d];
   1374 		float deltaAccel = delta_a[d];
   1375 		int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON );
   1376 		float step = negAccel / ( m0 ? deltaAccel : 1.0f );
   1377 		maxStep = m0 ? step : 0.0f;
   1378 		limit = d;
   1379 		limitSide = 0;
   1380 	}
   1381 
   1382 	// test the current variable
   1383 	{
   1384 		float deltaForce = dir;
   1385 		float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d];
   1386 		float step = ( forceLimit - f[d] ) / deltaForce;
   1387 		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
   1388 		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
   1389 		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
   1390 		int m2 = ( step < maxStep );
   1391 		int m3 = ( m0 & m1 & m2 );
   1392 		maxStep = m3 ? step : maxStep;
   1393 		limit = m3 ? d : limit;
   1394 		limitSide = m3 ? setSide : limitSide;
   1395 	}
   1396 
   1397 	// test the clamped bounded variables
   1398 	for ( int i = numUnbounded; i < numClamped; i++ ) {
   1399 		float deltaForce = delta_f[i];
   1400 		float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i];
   1401 		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
   1402 		float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f );
   1403 		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
   1404 		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
   1405 		int m2 = ( step < maxStep );
   1406 		int m3 = ( m0 & m1 & m2 );
   1407 		maxStep = m3 ? step : maxStep;
   1408 		limit = m3 ? i : limit;
   1409 		limitSide = m3 ? setSide : limitSide;
   1410 	}
   1411 
   1412 	// test the not clamped bounded variables
   1413 	for ( int i = numClamped; i < d; i++ ) {
   1414 		float negAccel = -a[i];
   1415 		float deltaAccel = delta_a[i];
   1416 		int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON );
   1417 		float step = negAccel / ( m0 ? deltaAccel : 1.0f );
   1418 		int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON );
   1419 		int m2 = ( step < maxStep );
   1420 		int m3 = ( m0 & m1 & m2 );
   1421 		maxStep = m3 ? step : maxStep;
   1422 		limit = m3 ? i : limit;
   1423 		limitSide = m3 ? 0 : limitSide;
   1424 	}
   1425 
   1426 #endif
   1427 }
   1428 
   1429 /*
   1430 ================================================================================================
   1431 
   1432 	SIMD test code
   1433 
   1434 ================================================================================================
   1435 */
   1436 
   1437 //#define ENABLE_TEST_CODE
   1438 
   1439 #ifdef ENABLE_TEST_CODE
   1440 
   1441 #define TEST_TRIANGULAR_SOLVE_SIMD_EPSILON		0.1f
   1442 #define TEST_TRIANGULAR_SOLVE_SIZE				50
   1443 #define TEST_FACTOR_SIMD_EPSILON				0.1f
   1444 #define TEST_FACTOR_SOLVE_SIZE					50
   1445 #define NUM_TESTS	50
   1446 
   1447 /*
   1448 ========================
   1449 PrintClocks
   1450 ========================
   1451 */
   1452 static void PrintClocks( const char * string, int dataCount, int64 clocks, int64 otherClocks = 0 ) {
   1453 	idLib::Printf( string );
   1454 	for ( int i = idStr::LengthWithoutColors(string); i < 48; i++ ) {
   1455 		idLib::Printf(" ");
   1456 	}
   1457 	if ( clocks && otherClocks ) {
   1458 		int p = 0;
   1459 		if ( clocks <= otherClocks ) {
   1460 			p = idMath::Ftoi( (float) ( otherClocks - clocks ) * 100.0f / (float) otherClocks );
   1461 		} else {
   1462 			p = - idMath::Ftoi( (float) ( clocks - otherClocks ) * 100.0f / (float) clocks );
   1463 		}
   1464 		idLib::Printf( "c = %4d, clcks = %5lld, %d%%\n", dataCount, clocks, p );
   1465 	} else {
   1466 		idLib::Printf( "c = %4d, clcks = %5lld\n", dataCount, clocks );
   1467 	}
   1468 }
   1469 
   1470 /*
   1471 ========================
   1472 DotProduct_Test
   1473 ========================
   1474 */
   1475 static void DotProduct_Test() {
   1476 	ALIGN16( float fsrc0[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
   1477 	ALIGN16( float fsrc1[TEST_TRIANGULAR_SOLVE_SIZE + 1]; )
   1478 
   1479 	idRandom srnd( 13 );
   1480 
   1481 	for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
   1482 		fsrc0[i] = srnd.CRandomFloat() * 10.0f;
   1483 		fsrc1[i] = srnd.CRandomFloat() * 10.0f;
   1484 	}
   1485 
   1486 	idTimer timer;
   1487 
   1488 	for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
   1489 
   1490 		float dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
   1491 		int64 clocksGeneric = 0xFFFFFFFFFFFF;
   1492 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1493 			fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
   1494 			timer.Clear();
   1495 			timer.Start();
   1496 			dot1 = DotProduct_Generic( fsrc0, fsrc1, i );
   1497 			timer.Stop();
   1498 			clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
   1499 		}
   1500 
   1501 		PrintClocks( va( "DotProduct_Generic %d", i ), 1, clocksGeneric );
   1502 
   1503 		float dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
   1504 		int64 clocksSIMD = 0xFFFFFFFFFFFF;
   1505 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1506 			fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j;
   1507 			timer.Clear();
   1508 			timer.Start();
   1509 			dot2 = DotProduct_SIMD( fsrc0, fsrc1, i );
   1510 			timer.Stop();
   1511 			clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
   1512 		}
   1513 
   1514 		const char * result = idMath::Fabs( dot1 - dot2 ) < 1e-4f ? "ok" : S_COLOR_RED"X";
   1515 		PrintClocks( va( "DotProduct_SIMD    %d %s", i, result ), 1, clocksSIMD, clocksGeneric );
   1516 	}
   1517 }
   1518 
   1519 /*
   1520 ========================
   1521 LowerTriangularSolve_Test
   1522 ========================
   1523 */
   1524 static void LowerTriangularSolve_Test() {
   1525 	idMatX L;
   1526 	idVecX x, b, tst;
   1527 
   1528 	int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
   1529 
   1530 	L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
   1531 	x.SetSize( paddedSize );
   1532 	b.Random( paddedSize, 0, -1.0f, 1.0f );
   1533 
   1534 	idTimer timer;
   1535 
   1536 	const int skip = 0;
   1537 
   1538 	for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
   1539 
   1540 		x.Zero( i );
   1541 
   1542 		LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
   1543 		int64 clocksGeneric = 0xFFFFFFFFFFFF;
   1544 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1545 			timer.Clear();
   1546 			timer.Start();
   1547 			LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
   1548 			timer.Stop();
   1549 			clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
   1550 		}
   1551 
   1552 		tst = x;
   1553 		x.Zero();
   1554 
   1555 		PrintClocks( va( "LowerTriangularSolve_Generic %dx%d", i, i ), 1, clocksGeneric );
   1556 
   1557 		LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
   1558 		int64 clocksSIMD = 0xFFFFFFFFFFFF;
   1559 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1560 			timer.Clear();
   1561 			timer.Start();
   1562 			LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip );
   1563 			timer.Stop();
   1564 			clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
   1565 		}
   1566 
   1567 		const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
   1568 		PrintClocks( va( "LowerTriangularSolve_SIMD    %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
   1569 	}
   1570 }
   1571 
   1572 /*
   1573 ========================
   1574 LowerTriangularSolveTranspose_Test
   1575 ========================
   1576 */
   1577 static void LowerTriangularSolveTranspose_Test() {
   1578 	idMatX L;
   1579 	idVecX x, b, tst;
   1580 
   1581 	int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3;
   1582 
   1583 	L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
   1584 	x.SetSize( paddedSize );
   1585 	b.Random( paddedSize, 0, -1.0f, 1.0f );
   1586 
   1587 	idTimer timer;
   1588 
   1589 	for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) {
   1590 
   1591 		x.Zero( i );
   1592 
   1593 		LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
   1594 		int64 clocksGeneric = 0xFFFFFFFFFFFF;
   1595 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1596 			timer.Clear();
   1597 			timer.Start();
   1598 			LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
   1599 			timer.Stop();
   1600 			clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
   1601 		}
   1602 
   1603 		tst = x;
   1604 		x.Zero();
   1605 
   1606 		PrintClocks( va( "LowerTriangularSolveTranspose_Generic %dx%d", i, i ), 1, clocksGeneric );
   1607 
   1608 		LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
   1609 		int64 clocksSIMD = 0xFFFFFFFFFFFF;
   1610 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1611 			timer.Clear();
   1612 			timer.Start();
   1613 			LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i );
   1614 			timer.Stop();
   1615 			clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
   1616 		}
   1617 
   1618 		const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
   1619 		PrintClocks( va( "LowerTriangularSolveTranspose_SIMD    %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
   1620 	}
   1621 }
   1622 
   1623 /*
   1624 ========================
   1625 LDLT_Factor_Test
   1626 ========================
   1627 */
   1628 static void LDLT_Factor_Test() {
   1629 	idMatX src, original, mat1, mat2;
   1630 	idVecX invDiag1, invDiag2;
   1631 
   1632 	int paddedSize = ( TEST_FACTOR_SOLVE_SIZE + 3 ) & ~3;
   1633 
   1634 	original.SetSize( paddedSize, paddedSize );
   1635 	src.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f );
   1636 	src.TransposeMultiply( original, src );
   1637 
   1638 	idTimer timer;
   1639 
   1640 	for ( int i = 1; i < TEST_FACTOR_SOLVE_SIZE; i++ ) {
   1641 
   1642 		int64 clocksGeneric = 0xFFFFFFFFFFFF;
   1643 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1644 			mat1 = original;
   1645 			invDiag1.Zero( TEST_FACTOR_SOLVE_SIZE );
   1646 			timer.Clear();
   1647 			timer.Start();
   1648 			LDLT_Factor_Generic( mat1, invDiag1, i );
   1649 			timer.Stop();
   1650 			clocksGeneric = Min( clocksGeneric, timer.ClockTicks() );
   1651 		}
   1652 
   1653 		PrintClocks( va( "LDLT_Factor_Generic %dx%d", i, i ), 1, clocksGeneric );
   1654 
   1655 		int64 clocksSIMD = 0xFFFFFFFFFFFF;
   1656 		for ( int j = 0; j < NUM_TESTS; j++ ) {
   1657 			mat2 = original;
   1658 			invDiag2.Zero( TEST_FACTOR_SOLVE_SIZE );
   1659 			timer.Clear();
   1660 			timer.Start();
   1661 			LDLT_Factor_SIMD( mat2, invDiag2, i );
   1662 			timer.Stop();
   1663 			clocksSIMD = Min( clocksSIMD, timer.ClockTicks() );
   1664 		}
   1665 
   1666 		const char * result = mat1.Compare( mat2, TEST_FACTOR_SIMD_EPSILON ) && invDiag1.Compare( invDiag2, TEST_FACTOR_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X";
   1667 		PrintClocks( va( "LDLT_Factor_SIMD    %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric );
   1668 	}
   1669 }
   1670 #endif
   1671 
   1672 #define Multiply						Multiply_SIMD
   1673 #define MultiplyAdd						MultiplyAdd_SIMD
   1674 #define BigDotProduct					DotProduct_SIMD
   1675 #define LowerTriangularSolve			LowerTriangularSolve_SIMD
   1676 #define LowerTriangularSolveTranspose	LowerTriangularSolveTranspose_SIMD
   1677 #define UpperTriangularSolve			UpperTriangularSolve_SIMD
   1678 #define LU_Factor						LU_Factor_SIMD
   1679 #define LDLT_Factor						LDLT_Factor_SIMD
   1680 #define GetMaxStep						GetMaxStep_SIMD
   1681 
   1682 /*
   1683 ================================================================================================
   1684 
   1685 	idLCP_Square
   1686 
   1687 ================================================================================================
   1688 */
   1689 
   1690 /*
   1691 ================================================
   1692 idLCP_Square
   1693 ================================================
   1694 */
   1695 class idLCP_Square : public idLCP {
   1696 public:
   1697 	virtual bool	Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
   1698 
   1699 private:
   1700 	idMatX			m;					// original matrix
   1701 	idVecX			b;					// right hand side
   1702 	idVecX			lo, hi;				// low and high bounds
   1703 	idVecX			f, a;				// force and acceleration
   1704 	idVecX			delta_f, delta_a;	// delta force and delta acceleration
   1705 	idMatX			clamped;			// LU factored sub matrix for clamped variables
   1706 	idVecX			diagonal;			// reciprocal of diagonal of U of the LU factored sub matrix for clamped variables
   1707 	int				numUnbounded;		// number of unbounded variables
   1708 	int				numClamped;			// number of clamped variables
   1709 	float **		rowPtrs;			// pointers to the rows of m
   1710 	int *			boxIndex;			// box index
   1711 	int *			side;				// tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
   1712 	int *			permuted;			// index to keep track of the permutation
   1713 	bool			padded;				// set to true if the rows of the initial matrix are 16 byte padded
   1714 
   1715 	bool			FactorClamped();
   1716 	void			SolveClamped( idVecX & x, const float * b );
   1717 	void			Swap( int i, int j );
   1718 	void			AddClamped( int r );
   1719 	void			RemoveClamped( int r );
   1720 	void			CalcForceDelta( int d, float dir );
   1721 	void			CalcAccelDelta( int d );
   1722 	void			ChangeForce( int d, float step );
   1723 	void			ChangeAccel( int d, float step );
   1724 };
   1725 
   1726 /*
   1727 ========================
   1728 idLCP_Square::FactorClamped
   1729 ========================
   1730 */
   1731 bool idLCP_Square::FactorClamped() {
   1732 	for ( int i = 0; i < numClamped; i++ ) {
   1733 		memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
   1734 	}
   1735 	return LU_Factor( clamped, diagonal, numClamped );
   1736 }
   1737 
   1738 /*
   1739 ========================
   1740 idLCP_Square::SolveClamped
   1741 ========================
   1742 */
   1743 void idLCP_Square::SolveClamped( idVecX & x, const float * b ) {
   1744 	// solve L
   1745 	LowerTriangularSolve( clamped, x.ToFloatPtr(), b, numClamped, 0 );
   1746 
   1747 	// solve U
   1748 	UpperTriangularSolve( clamped, diagonal.ToFloatPtr(), x.ToFloatPtr(), x.ToFloatPtr(), numClamped );
   1749 }
   1750 
   1751 /*
   1752 ========================
   1753 idLCP_Square::Swap
   1754 ========================
   1755 */
   1756 void idLCP_Square::Swap( int i, int j ) {
   1757 
   1758 	if ( i == j ) {
   1759 		return;
   1760 	}
   1761 
   1762 	SwapValues( rowPtrs[i], rowPtrs[j] );
   1763 	m.SwapColumns( i, j );
   1764 	b.SwapElements( i, j );
   1765 	lo.SwapElements( i, j );
   1766 	hi.SwapElements( i, j );
   1767 	a.SwapElements( i, j );
   1768 	f.SwapElements( i, j );
   1769 	if ( boxIndex != NULL ) {
   1770 		SwapValues( boxIndex[i], boxIndex[j] );
   1771 	}
   1772 	SwapValues( side[i], side[j] );
   1773 	SwapValues( permuted[i], permuted[j] );
   1774 }
   1775 
   1776 /*
   1777 ========================
   1778 idLCP_Square::AddClamped
   1779 ========================
   1780 */
   1781 void idLCP_Square::AddClamped( int r ) {
   1782 
   1783 	assert( r >= numClamped );
   1784 
   1785 	// add a row at the bottom and a column at the right of the factored
   1786 	// matrix for the clamped variables
   1787 
   1788 	Swap( numClamped, r );
   1789 
   1790 	// add row to L
   1791 	for ( int i = 0; i < numClamped; i++ ) {
   1792 		float sum = rowPtrs[numClamped][i];
   1793 		for ( int j = 0; j < i; j++ ) {
   1794 			sum -= clamped[numClamped][j] * clamped[j][i];
   1795 		}
   1796 		clamped[numClamped][i] = sum * diagonal[i];
   1797 	}
   1798 
   1799 	// add column to U
   1800 	for ( int i = 0; i <= numClamped; i++ ) {
   1801 		float sum = rowPtrs[i][numClamped];
   1802 		for ( int j = 0; j < i; j++ ) {
   1803 			sum -= clamped[i][j] * clamped[j][numClamped];
   1804 		}
   1805 		clamped[i][numClamped] = sum;
   1806 	}
   1807 
   1808 	diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped];
   1809 
   1810 	numClamped++;
   1811 }
   1812 
   1813 /*
   1814 ========================
   1815 idLCP_Square::RemoveClamped
   1816 ========================
   1817 */
   1818 void idLCP_Square::RemoveClamped( int r ) {
   1819 
   1820 	if ( !verify( r < numClamped ) ) {
   1821 		// complete fail, most likely due to exceptional floating point values
   1822 		return;
   1823 	}
   1824 
   1825 	numClamped--;
   1826 
   1827 	// no need to swap and update the factored matrix when the last row and column are removed
   1828 	if ( r == numClamped ) {
   1829 		return;
   1830 	}
   1831 
   1832 	float * y0 = (float *) _alloca16( numClamped * sizeof( float ) );
   1833 	float * z0 = (float *) _alloca16( numClamped * sizeof( float ) );
   1834 	float * y1 = (float *) _alloca16( numClamped * sizeof( float ) );
   1835 	float * z1 = (float *) _alloca16( numClamped * sizeof( float ) );
   1836 
   1837 	// the row/column need to be subtracted from the factorization
   1838 	for ( int i = 0; i < numClamped; i++ ) {
   1839 		y0[i] = -rowPtrs[i][r];
   1840 	}
   1841 
   1842 	memset( y1, 0, numClamped * sizeof( float ) );
   1843 	y1[r] = 1.0f;
   1844 
   1845 	memset( z0, 0, numClamped * sizeof( float ) );
   1846 	z0[r] = 1.0f;
   1847 
   1848 	for ( int i = 0; i < numClamped; i++ ) {
   1849 		z1[i] = -rowPtrs[r][i];
   1850 	}
   1851 
   1852 	// swap the to be removed row/column with the last row/column
   1853 	Swap( r, numClamped );
   1854 
   1855 	// the swapped last row/column need to be added to the factorization
   1856 	for ( int i = 0; i < numClamped; i++ ) {
   1857 		y0[i] += rowPtrs[i][r];
   1858 	}
   1859 
   1860 	for ( int i = 0; i < numClamped; i++ ) {
   1861 		z1[i] += rowPtrs[r][i];
   1862 	}
   1863 	z1[r] = 0.0f;
   1864 
   1865 	// update the beginning of the to be updated row and column
   1866 	for ( int i = 0; i < r; i++ ) {
   1867 		float p0 = y0[i];
   1868 		float beta1 = z1[i] * diagonal[i];
   1869 
   1870 		clamped[i][r] += p0;
   1871 		for ( int j = i+1; j < numClamped; j++ ) {
   1872 			z1[j] -= beta1 * clamped[i][j];
   1873 		}
   1874 		for ( int j = i+1; j < numClamped; j++ ) {
   1875 			y0[j] -= p0 * clamped[j][i];
   1876 		}
   1877 		clamped[r][i] += beta1;
   1878 	}
   1879 
   1880 	// update the lower right corner starting at r,r
   1881 	for ( int i = r; i < numClamped; i++ ) {
   1882 		float diag = clamped[i][i];
   1883 
   1884 		float p0 = y0[i];
   1885 		float p1 = z0[i];
   1886 		diag += p0 * p1;
   1887 
   1888 		if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   1889 			idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
   1890 			diag = idMath::FLT_SMALLEST_NON_DENORMAL;
   1891 		}
   1892 
   1893 		float beta0 = p1 / diag;
   1894 
   1895 		float q0 = y1[i];
   1896 		float q1 = z1[i];
   1897 		diag += q0 * q1;
   1898 
   1899 		if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   1900 			idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" );
   1901 			diag = idMath::FLT_SMALLEST_NON_DENORMAL;
   1902 		}
   1903 
   1904 		float d = 1.0f / diag;
   1905 		float beta1 = q1 * d;
   1906 
   1907 		clamped[i][i] = diag;
   1908 		diagonal[i] = d;
   1909 
   1910 		for ( int j = i+1; j < numClamped; j++ ) {
   1911 
   1912 			d = clamped[i][j];
   1913 
   1914 			d += p0 * z0[j];
   1915 			z0[j] -= beta0 * d;
   1916 
   1917 			d += q0 * z1[j];
   1918 			z1[j] -= beta1 * d;
   1919 
   1920 			clamped[i][j] = d;
   1921 		}
   1922 
   1923 		for ( int j = i+1; j < numClamped; j++ ) {
   1924 
   1925 			d = clamped[j][i];
   1926 
   1927 			y0[j] -= p0 * d;
   1928 			d += beta0 * y0[j];
   1929 
   1930 			y1[j] -= q0 * d;
   1931 			d += beta1 * y1[j];
   1932 
   1933 			clamped[j][i] = d;
   1934 		}
   1935 	}
   1936 	return;
   1937 }
   1938 
   1939 /*
   1940 ========================
   1941 idLCP_Square::CalcForceDelta
   1942 
   1943 Modifies this->delta_f.
   1944 ========================
   1945 */
   1946 void idLCP_Square::CalcForceDelta( int d, float dir ) {
   1947 
   1948 	delta_f[d] = dir;
   1949 
   1950 	if ( numClamped <= 0 ) {
   1951 		return;
   1952 	}
   1953 
   1954 	// get column d of matrix
   1955 	float * ptr = (float *) _alloca16( numClamped * sizeof( float ) );
   1956 	for ( int i = 0; i < numClamped; i++ ) {
   1957 		ptr[i] = rowPtrs[i][d];
   1958 	}
   1959 
   1960 	// solve force delta
   1961 	SolveClamped( delta_f, ptr );
   1962 
   1963 	// flip force delta based on direction
   1964 	if ( dir > 0.0f ) {
   1965 		ptr = delta_f.ToFloatPtr();
   1966 		for ( int i = 0; i < numClamped; i++ ) {
   1967 			ptr[i] = - ptr[i];
   1968 		}
   1969 	}
   1970 }
   1971 
   1972 /*
   1973 ========================
   1974 idLCP_Square::CalcAccelDelta
   1975 
   1976 Modifies this->delta_a and uses this->delta_f.
   1977 ========================
   1978 */
   1979 ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) {
   1980 	// only the not clamped variables, including the current variable, can have a change in acceleration
   1981 	for ( int j = numClamped; j <= d; j++ ) {
   1982 		// only the clamped variables and the current variable have a force delta unequal zero
   1983 		float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
   1984 		delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
   1985 	}
   1986 }
   1987 
   1988 /*
   1989 ========================
   1990 idLCP_Square::ChangeForce
   1991 
   1992 Modifies this->f and uses this->delta_f.
   1993 ========================
   1994 */
   1995 ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) {
   1996 	// only the clamped variables and current variable have a force delta unequal zero
   1997 	MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
   1998 	f[d] += step * delta_f[d];
   1999 }
   2000 
   2001 /*
   2002 ========================
   2003 idLCP_Square::ChangeAccel
   2004 
   2005 Modifies this->a and uses this->delta_a.
   2006 ========================
   2007 */
   2008 ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) {
   2009 	// only the not clamped variables, including the current variable, can have an acceleration unequal zero
   2010 	MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
   2011 }
   2012 
   2013 /*
   2014 ========================
   2015 idLCP_Square::Solve
   2016 ========================
   2017 */
   2018 bool idLCP_Square::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
   2019 
   2020 	// true when the matrix rows are 16 byte padded
   2021 	padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
   2022 
   2023 	assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
   2024 	assert( o_x.GetSize() == o_m.GetNumRows() );
   2025 	assert( o_b.GetSize() == o_m.GetNumRows() );
   2026 	assert( o_lo.GetSize() == o_m.GetNumRows() );
   2027 	assert( o_hi.GetSize() == o_m.GetNumRows() );
   2028 
   2029 	// allocate memory for permuted input
   2030 	f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
   2031 	a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
   2032 	b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
   2033 	lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
   2034 	hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
   2035 	if ( o_boxIndex != NULL ) {
   2036 		boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
   2037 		memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
   2038 	} else {
   2039 		boxIndex = NULL;
   2040 	}
   2041 
   2042 	// we override the const on o_m here but on exit the matrix is unchanged
   2043 	m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) );
   2044 	f.Zero();
   2045 	a.Zero();
   2046 	b = o_b;
   2047 	lo = o_lo;
   2048 	hi = o_hi;
   2049 
   2050 	// pointers to the rows of m
   2051 	rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
   2052 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2053 		rowPtrs[i] = m[i];
   2054 	}
   2055 
   2056 	// tells if a variable is at the low boundary, high boundary or inbetween
   2057 	side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
   2058 
   2059 	// index to keep track of the permutation
   2060 	permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
   2061 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2062 		permuted[i] = i;
   2063 	}
   2064 
   2065 	// permute input so all unbounded variables come first
   2066 	numUnbounded = 0;
   2067 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2068 		if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
   2069 			if ( numUnbounded != i ) {
   2070 				Swap( numUnbounded, i );
   2071 			}
   2072 			numUnbounded++;
   2073 		}
   2074 	}
   2075 
   2076 	// permute input so all variables using the boxIndex come last
   2077 	int boxStartIndex = m.GetNumRows();
   2078 	if ( boxIndex ) {
   2079 		for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
   2080 			if ( boxIndex[i] >= 0 ) {
   2081 				boxStartIndex--;
   2082 				if ( boxStartIndex != i ) {
   2083 					Swap( boxStartIndex, i );
   2084 				}
   2085 			}
   2086 		}
   2087 	}
   2088 
   2089 	// sub matrix for factorization 
   2090 	clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) );
   2091 	diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2092 
   2093 	// all unbounded variables are clamped
   2094 	numClamped = numUnbounded;
   2095 
   2096 	// if there are unbounded variables
   2097 	if ( numUnbounded ) {
   2098 
   2099 		// factor and solve for unbounded variables
   2100 		if ( !FactorClamped() ) {
   2101 			idLib::Printf( "idLCP_Square::Solve: unbounded factorization failed\n" );
   2102 			return false;
   2103 		}
   2104 		SolveClamped( f, b.ToFloatPtr() );
   2105 
   2106 		// if there are no bounded variables we are done
   2107 		if ( numUnbounded == m.GetNumRows() ) {
   2108 			o_x = f;	// the vector is not permuted
   2109 			return true;
   2110 		}
   2111 	}
   2112 
   2113 	int numIgnored = 0;
   2114 
   2115 	// allocate for delta force and delta acceleration
   2116 	delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2117 	delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2118 
   2119 	// solve for bounded variables
   2120 	idStr failed;
   2121 	for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {
   2122 
   2123 		// once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
   2124 		if ( i == boxStartIndex ) {
   2125 			for ( int j = 0; j < boxStartIndex; j++ ) {
   2126 				o_x[permuted[j]] = f[j];
   2127 			}
   2128 			for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
   2129 				float s = o_x[boxIndex[j]];
   2130 				if ( lo[j] != -idMath::INFINITY ) {
   2131 					lo[j] = - idMath::Fabs( lo[j] * s );
   2132 				}
   2133 				if ( hi[j] != idMath::INFINITY ) {
   2134 					hi[j] = idMath::Fabs( hi[j] * s );
   2135 				}
   2136 			}
   2137 		}
   2138 
   2139 		// calculate acceleration for current variable
   2140 		float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
   2141 		a[i] = dot - b[i];
   2142 
   2143 		// if already at the low boundary
   2144 		if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
   2145 			side[i] = -1;
   2146 			continue;
   2147 		}
   2148 
   2149 		// if already at the high boundary
   2150 		if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
   2151 			side[i] = 1;
   2152 			continue;
   2153 		}
   2154 
   2155 		// if inside the clamped region
   2156 		if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
   2157 			side[i] = 0;
   2158 			AddClamped( i );
   2159 			continue;
   2160 		}
   2161 
   2162 		// drive the current variable into a valid region
   2163 		int n = 0;
   2164 		for ( ; n < maxIterations; n++ ) {
   2165 
   2166 			// direction to move
   2167 			float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;
   2168 
   2169 			// calculate force delta
   2170 			CalcForceDelta( i, dir );
   2171 
   2172 			// calculate acceleration delta: delta_a = m * delta_f;
   2173 			CalcAccelDelta( i );
   2174 
   2175 			float maxStep;
   2176 			int limit;
   2177 			int limitSide;
   2178 
   2179 			// maximum step we can take
   2180 			GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
   2181 							lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
   2182 								i, dir, maxStep, limit, limitSide );
   2183 
   2184 			if ( maxStep <= 0.0f ) {
   2185 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
   2186 				// ignore the current variable completely
   2187 				lo[i] = hi[i] = 0.0f;
   2188 				f[i] = 0.0f;
   2189 				side[i] = -1;
   2190 				numIgnored++;
   2191 #else
   2192 				failed.Format( "invalid step size %.4f", maxStep );
   2193 				for ( int j = i; j < m.GetNumRows(); j++ ) {
   2194 					f[j] = 0.0f;
   2195 				}
   2196 				numIgnored = m.GetNumRows() - i;
   2197 #endif
   2198 				break;
   2199 			}
   2200 
   2201 			// change force
   2202 			ChangeForce( i, maxStep );
   2203 
   2204 			// change acceleration
   2205 			ChangeAccel( i, maxStep );
   2206 
   2207 			// clamp/unclamp the variable that limited this step
   2208 			side[limit] = limitSide;
   2209 			if ( limitSide == 0 ) {
   2210 				a[limit] = 0.0f;
   2211 				AddClamped( limit );
   2212 			} else if ( limitSide == -1 ) {
   2213 				f[limit] = lo[limit];
   2214 				if ( limit != i ) {
   2215 					RemoveClamped( limit );
   2216 				}
   2217 			} else if ( limitSide == 1 ) {
   2218 				f[limit] = hi[limit];
   2219 				if ( limit != i ) {
   2220 					RemoveClamped( limit );
   2221 				}
   2222 			}
   2223 
   2224 			// if the current variable limited the step we can continue with the next variable
   2225 			if ( limit == i ) {
   2226 				break;
   2227 			}
   2228 		}
   2229 
   2230 		if ( n >= maxIterations ) {
   2231 			failed.Format( "max iterations %d", maxIterations );
   2232 			break;
   2233 		}
   2234 
   2235 		if ( failed.Length() ) {
   2236 			break;
   2237 		}
   2238 	}
   2239 
   2240 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
   2241 	if ( numIgnored ) {
   2242 		if ( lcp_showFailures.GetBool() ) {
   2243 			idLib::Printf( "idLCP_Square::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
   2244 		}
   2245 	}
   2246 #endif
   2247 
   2248 	// if failed clear remaining forces
   2249 	if ( failed.Length() ) {
   2250 		if ( lcp_showFailures.GetBool() ) {
   2251 			idLib::Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
   2252 		}
   2253 	}
   2254 
   2255 #if defined(_DEBUG) && 0
   2256 	if ( failed.Length() ) {
   2257 		// test whether or not the solution satisfies the complementarity conditions
   2258 		for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2259 			a[i] = -b[i];
   2260 			for ( int j = 0; j < m.GetNumRows(); j++ ) {
   2261 				a[i] += rowPtrs[i][j] * f[j];
   2262 			}
   2263 
   2264 			if ( f[i] == lo[i] ) {
   2265 				if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
   2266 					int bah1 = 1;
   2267 				}
   2268 			} else if ( f[i] == hi[i] ) {
   2269 				if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
   2270 					int bah2 = 1;
   2271 				}
   2272 			} else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
   2273 				int bah3 = 1;
   2274 			}
   2275 		}
   2276 	}
   2277 #endif
   2278 
   2279 	// unpermute result
   2280 	for ( int i = 0; i < f.GetSize(); i++ ) {
   2281 		o_x[permuted[i]] = f[i];
   2282 	}
   2283 
   2284 	return true;
   2285 }
   2286 
   2287 /*
   2288 ================================================================================================
   2289 
   2290 	idLCP_Symmetric
   2291 
   2292 ================================================================================================
   2293 */
   2294 
   2295 /*
   2296 ================================================
   2297 idLCP_Symmetric
   2298 ================================================
   2299 */
   2300 class idLCP_Symmetric : public idLCP {
   2301 public:
   2302 	virtual bool	Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex );
   2303 
   2304 private:
   2305 	idMatX			m;					// original matrix
   2306 	idVecX			b;					// right hand side
   2307 	idVecX			lo, hi;				// low and high bounds
   2308 	idVecX			f, a;				// force and acceleration
   2309 	idVecX			delta_f, delta_a;	// delta force and delta acceleration
   2310 	idMatX			clamped;			// LDLt factored sub matrix for clamped variables
   2311 	idVecX			diagonal;			// reciprocal of diagonal of LDLt factored sub matrix for clamped variables
   2312 	idVecX			solveCache1;		// intermediate result cached in SolveClamped
   2313 	idVecX			solveCache2;		// "
   2314 	int				numUnbounded;		// number of unbounded variables
   2315 	int				numClamped;			// number of clamped variables
   2316 	int				clampedChangeStart;	// lowest row/column changed in the clamped matrix during an iteration
   2317 	float **		rowPtrs;			// pointers to the rows of m
   2318 	int *			boxIndex;			// box index
   2319 	int *			side;				// tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0
   2320 	int *			permuted;			// index to keep track of the permutation
   2321 	bool			padded;				// set to true if the rows of the initial matrix are 16 byte padded
   2322 
   2323 	bool			FactorClamped();
   2324 	void			SolveClamped( idVecX &x, const float *b );
   2325 	void			Swap( int i, int j );
   2326 	void			AddClamped( int r, bool useSolveCache );
   2327 	void			RemoveClamped( int r );
   2328 	void			CalcForceDelta( int d, float dir );
   2329 	void			CalcAccelDelta( int d );
   2330 	void			ChangeForce( int d, float step );
   2331 	void			ChangeAccel( int d, float step );
   2332 };
   2333 
   2334 /*
   2335 ========================
   2336 idLCP_Symmetric::FactorClamped
   2337 ========================
   2338 */
   2339 bool idLCP_Symmetric::FactorClamped() {
   2340 
   2341 	clampedChangeStart = 0;
   2342 
   2343 	for ( int i = 0; i < numClamped; i++ ) {
   2344 		memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) );
   2345 	}
   2346 	return LDLT_Factor( clamped, diagonal, numClamped );
   2347 }
   2348 
   2349 /*
   2350 ========================
   2351 idLCP_Symmetric::SolveClamped
   2352 ========================
   2353 */
   2354 void idLCP_Symmetric::SolveClamped( idVecX &x, const float *b ) {
   2355 
   2356 	// solve L
   2357 	LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart );
   2358 
   2359 	// scale with D
   2360 	Multiply( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped );
   2361 
   2362 	// solve Lt
   2363 	LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped );
   2364 
   2365 	clampedChangeStart = numClamped;
   2366 }
   2367 
   2368 /*
   2369 ========================
   2370 idLCP_Symmetric::Swap
   2371 ========================
   2372 */
   2373 void idLCP_Symmetric::Swap( int i, int j ) {
   2374 
   2375 	if ( i == j ) {
   2376 		return;
   2377 	}
   2378 
   2379 	SwapValues( rowPtrs[i], rowPtrs[j] );
   2380 	m.SwapColumns( i, j );
   2381 	b.SwapElements( i, j );
   2382 	lo.SwapElements( i, j );
   2383 	hi.SwapElements( i, j );
   2384 	a.SwapElements( i, j );
   2385 	f.SwapElements( i, j );
   2386 	if ( boxIndex != NULL ) {
   2387 		SwapValues( boxIndex[i], boxIndex[j] );
   2388 	}
   2389 	SwapValues( side[i], side[j] );
   2390 	SwapValues( permuted[i], permuted[j] );
   2391 }
   2392 
   2393 /*
   2394 ========================
   2395 idLCP_Symmetric::AddClamped
   2396 ========================
   2397 */
   2398 void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) {
   2399 
   2400 	assert( r >= numClamped );
   2401 
   2402 	if ( numClamped < clampedChangeStart ) {
   2403 		clampedChangeStart = numClamped;
   2404 	}
   2405 
   2406 	// add a row at the bottom and a column at the right of the factored
   2407 	// matrix for the clamped variables
   2408 
   2409 	Swap( numClamped, r );
   2410 
   2411 	// solve for v in L * v = rowPtr[numClamped]
   2412 	float dot;
   2413 	if ( useSolveCache ) {
   2414 
   2415 		// the lower triangular solve was cached in SolveClamped called by CalcForceDelta
   2416 		memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) );
   2417 		// calculate row dot product
   2418 		dot = BigDotProduct( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped );
   2419 
   2420 	} else {
   2421 
   2422 		float *v = (float *) _alloca16( numClamped * sizeof( float ) );
   2423 
   2424 		LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped, 0 );
   2425 		// add bottom row to L
   2426 		Multiply( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped );
   2427 		// calculate row dot product
   2428 		dot = BigDotProduct( clamped[numClamped], v, numClamped );
   2429 	}
   2430 
   2431 	// update diagonal[numClamped]
   2432 	float d = rowPtrs[numClamped][numClamped] - dot;
   2433 
   2434 	if ( fabs( d ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2435 		idLib::Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" );
   2436 		d = idMath::FLT_SMALLEST_NON_DENORMAL;
   2437 	}
   2438 
   2439 	clamped[numClamped][numClamped] = d;
   2440 	diagonal[numClamped] = 1.0f / d;
   2441 
   2442 	numClamped++;
   2443 }
   2444 
   2445 /*
   2446 ========================
   2447 idLCP_Symmetric::RemoveClamped
   2448 ========================
   2449 */
   2450 void idLCP_Symmetric::RemoveClamped( int r ) {
   2451 
   2452 	if ( !verify( r < numClamped ) ) {
   2453 		// complete fail, most likely due to exceptional floating point values
   2454 		return;
   2455 	}
   2456 
   2457 	if ( r < clampedChangeStart ) {
   2458 		clampedChangeStart = r;
   2459 	}
   2460 
   2461 	numClamped--;
   2462 
   2463 	// no need to swap and update the factored matrix when the last row and column are removed
   2464 	if ( r == numClamped ) {
   2465 		return;
   2466 	}
   2467 
   2468 	// swap the to be removed row/column with the last row/column
   2469 	Swap( r, numClamped );
   2470 
   2471 	// update the factored matrix
   2472 	float * addSub = (float *) _alloca16( numClamped * sizeof( float ) );
   2473 
   2474 	if ( r == 0 ) {
   2475 
   2476 		if ( numClamped == 1 ) {
   2477 			float diag = rowPtrs[0][0];
   2478 			if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2479 				idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
   2480 				diag = idMath::FLT_SMALLEST_NON_DENORMAL;
   2481 			}
   2482 			clamped[0][0] = diag;
   2483 			diagonal[0] = 1.0f / diag;
   2484 			return;
   2485 		}
   2486 
   2487 		// calculate the row/column to be added to the lower right sub matrix starting at (r, r)
   2488 		float * original = rowPtrs[numClamped];
   2489 		float * ptr = rowPtrs[r];
   2490 		addSub[0] = ptr[0] - original[numClamped];
   2491 		for ( int i = 1; i < numClamped; i++ ) {
   2492 			addSub[i] = ptr[i] - original[i];
   2493 		}
   2494 
   2495 	} else {
   2496 
   2497 		float * v = (float *) _alloca16( numClamped * sizeof( float ) );
   2498 
   2499 		// solve for v in L * v = rowPtr[r]
   2500 		LowerTriangularSolve( clamped, v, rowPtrs[r], r, 0 );
   2501 
   2502 		// update removed row
   2503 		Multiply( clamped[r], v, diagonal.ToFloatPtr(), r );
   2504 
   2505 		// if the last row/column of the matrix is updated
   2506 		if ( r == numClamped - 1 ) {
   2507 			// only calculate new diagonal
   2508 			float dot = BigDotProduct( clamped[r], v, r );
   2509 			float diag = rowPtrs[r][r] - dot;
   2510 			if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2511 				idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
   2512 				diag = idMath::FLT_SMALLEST_NON_DENORMAL;
   2513 			}
   2514 			clamped[r][r] = diag;
   2515 			diagonal[r] = 1.0f / diag;
   2516 			return;
   2517 		}
   2518 
   2519 		// calculate the row/column to be added to the lower right sub matrix starting at (r, r)
   2520 		for ( int i = 0; i < r; i++ ) {
   2521 			v[i] = clamped[r][i] * clamped[i][i];
   2522 		}
   2523 		for ( int i = r; i < numClamped; i++ ) {
   2524 			float sum;
   2525 			if ( i == r ) {
   2526 				sum = clamped[r][r];
   2527 			} else {
   2528 				sum = clamped[r][r] * clamped[i][r];
   2529 			}
   2530 			float * ptr = clamped[i];
   2531 			for ( int j = 0; j < r; j++ ) {
   2532 				sum += ptr[j] * v[j];
   2533 			}
   2534 			addSub[i] = rowPtrs[r][i] - sum;
   2535 		}
   2536 	}
   2537 
   2538 	// add row/column to the lower right sub matrix starting at (r, r)
   2539 
   2540 	float * v1 = (float *) _alloca16( numClamped * sizeof( float ) );
   2541 	float * v2 = (float *) _alloca16( numClamped * sizeof( float ) );
   2542 
   2543 	float diag = idMath::SQRT_1OVER2;
   2544 	v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag;
   2545 	v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag;
   2546 	for ( int i = r+1; i < numClamped; i++ ) {
   2547 		v1[i] = v2[i] = addSub[i] * diag;
   2548 	}
   2549 
   2550 	float alpha1 = 1.0f;
   2551 	float alpha2 = -1.0f;
   2552 
   2553 	// simultaneous update/downdate of the sub matrix starting at (r, r)
   2554 	int n = clamped.GetNumColumns();
   2555 	for ( int i = r; i < numClamped; i++ ) {
   2556 
   2557 		diag = clamped[i][i];
   2558 		float p1 = v1[i];
   2559 		float newDiag = diag + alpha1 * p1 * p1;
   2560 
   2561 		if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2562 			idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
   2563 			newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
   2564 		}
   2565 
   2566 		alpha1 /= newDiag;
   2567 		float beta1 = p1 * alpha1;
   2568 		alpha1 *= diag;
   2569 
   2570 		diag = newDiag;
   2571 		float p2 = v2[i];
   2572 		newDiag = diag + alpha2 * p2 * p2;
   2573 
   2574 		if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2575 			idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" );
   2576 			newDiag = idMath::FLT_SMALLEST_NON_DENORMAL;
   2577 		}
   2578 
   2579 		clamped[i][i] = newDiag;
   2580 		float invNewDiag = 1.0f / newDiag;
   2581 		diagonal[i] = invNewDiag;
   2582 
   2583 		alpha2 *= invNewDiag;
   2584 		float beta2 = p2 * alpha2;
   2585 		alpha2 *= diag;
   2586 
   2587 		// update column below diagonal (i,i)
   2588 		float * ptr = clamped.ToFloatPtr() + i;
   2589 
   2590 		int j;
   2591 		for ( j = i+1; j < numClamped - 1; j += 2 ) {
   2592 
   2593 			float sum0 = ptr[(j+0)*n];
   2594 			float sum1 = ptr[(j+1)*n];
   2595 
   2596 			v1[j+0] -= p1 * sum0;
   2597 			v1[j+1] -= p1 * sum1;
   2598 
   2599 			sum0 += beta1 * v1[j+0];
   2600 			sum1 += beta1 * v1[j+1];
   2601 
   2602 			v2[j+0] -= p2 * sum0;
   2603 			v2[j+1] -= p2 * sum1;
   2604 
   2605 			sum0 += beta2 * v2[j+0];
   2606 			sum1 += beta2 * v2[j+1];
   2607 
   2608 			ptr[(j+0)*n] = sum0;
   2609 			ptr[(j+1)*n] = sum1;
   2610 		}
   2611 
   2612 		for ( ; j < numClamped; j++ ) {
   2613 
   2614 			float sum = ptr[j*n];
   2615 
   2616 			v1[j] -= p1 * sum;
   2617 			sum += beta1 * v1[j];
   2618 
   2619 			v2[j] -= p2 * sum;
   2620 			sum += beta2 * v2[j];
   2621 
   2622 			ptr[j*n] = sum;
   2623 		}
   2624 	}
   2625 }
   2626 
   2627 /*
   2628 ========================
   2629 idLCP_Symmetric::CalcForceDelta
   2630 
   2631 Modifies this->delta_f.
   2632 ========================
   2633 */
   2634 ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) {
   2635 
   2636 	delta_f[d] = dir;
   2637 
   2638 	if ( numClamped == 0 ) {
   2639 		return;
   2640 	}
   2641 
   2642 	// solve force delta
   2643 	SolveClamped( delta_f, rowPtrs[d] );
   2644 
   2645 	// flip force delta based on direction
   2646 	if ( dir > 0.0f ) {
   2647 		float * ptr = delta_f.ToFloatPtr();
   2648 		for ( int i = 0; i < numClamped; i++ ) {
   2649 			ptr[i] = - ptr[i];
   2650 		}
   2651 	}
   2652 }
   2653 
   2654 /*
   2655 ========================
   2656 idLCP_Symmetric::CalcAccelDelta
   2657 
   2658 Modifies this->delta_a and uses this->delta_f.
   2659 ========================
   2660 */
   2661 ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) {
   2662 	// only the not clamped variables, including the current variable, can have a change in acceleration
   2663 	for ( int j = numClamped; j <= d; j++ ) {
   2664 		// only the clamped variables and the current variable have a force delta unequal zero
   2665 		float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped );
   2666 		delta_a[j] = dot + rowPtrs[j][d] * delta_f[d];
   2667 	}
   2668 }
   2669 
   2670 /*
   2671 ========================
   2672 idLCP_Symmetric::ChangeForce
   2673 
   2674 Modifies this->f and uses this->delta_f.
   2675 ========================
   2676 */
   2677 ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) {
   2678 	// only the clamped variables and current variable have a force delta unequal zero
   2679 	MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped );
   2680 	f[d] += step * delta_f[d];
   2681 }
   2682 
   2683 /*
   2684 ========================
   2685 idLCP_Symmetric::ChangeAccel
   2686 
   2687 Modifies this->a and uses this->delta_a.
   2688 ========================
   2689 */
   2690 ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) {
   2691 	// only the not clamped variables, including the current variable, can have an acceleration unequal zero
   2692 	MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 );
   2693 }
   2694 
   2695 /*
   2696 ========================
   2697 idLCP_Symmetric::Solve
   2698 ========================
   2699 */
   2700 bool idLCP_Symmetric::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) {
   2701 
   2702 	// true when the matrix rows are 16 byte padded
   2703 	padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns();
   2704 
   2705 	assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() );
   2706 	assert( o_x.GetSize() == o_m.GetNumRows() );
   2707 	assert( o_b.GetSize() == o_m.GetNumRows() );
   2708 	assert( o_lo.GetSize() == o_m.GetNumRows() );
   2709 	assert( o_hi.GetSize() == o_m.GetNumRows() );
   2710 
   2711 	// allocate memory for permuted input
   2712 	f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) );
   2713 	a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
   2714 	b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) );
   2715 	lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) );
   2716 	hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) );
   2717 	if ( o_boxIndex != NULL ) {
   2718 		boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) );
   2719 		memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) );
   2720 	} else {
   2721 		boxIndex = NULL;
   2722 	}
   2723 
   2724 	// we override the const on o_m here but on exit the matrix is unchanged
   2725 	m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float * >( o_m[0] ) );
   2726 	f.Zero();
   2727 	a.Zero();
   2728 	b = o_b;
   2729 	lo = o_lo;
   2730 	hi = o_hi;
   2731 
   2732 	// pointers to the rows of m
   2733 	rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) );
   2734 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2735 		rowPtrs[i] = m[i];
   2736 	}
   2737 
   2738 	// tells if a variable is at the low boundary, high boundary or inbetween
   2739 	side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
   2740 
   2741 	// index to keep track of the permutation
   2742 	permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) );
   2743 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2744 		permuted[i] = i;
   2745 	}
   2746 
   2747 	// permute input so all unbounded variables come first
   2748 	numUnbounded = 0;
   2749 	for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2750 		if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) {
   2751 			if ( numUnbounded != i ) {
   2752 				Swap( numUnbounded, i );
   2753 			}
   2754 			numUnbounded++;
   2755 		}
   2756 	}
   2757 
   2758 	// permute input so all variables using the boxIndex come last
   2759 	int boxStartIndex = m.GetNumRows();
   2760 	if ( boxIndex != NULL ) {
   2761 		for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) {
   2762 			if ( boxIndex[i] >= 0 ) {
   2763 				boxStartIndex--;
   2764 				if ( boxStartIndex != i ) {
   2765 					Swap( boxStartIndex, i );
   2766 				}
   2767 			}
   2768 		}
   2769 	}
   2770 
   2771 	// sub matrix for factorization 
   2772 	clamped.SetDataCacheLines( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA_CACHE_LINES( m.GetNumRows() * m.GetNumColumns() ), true );
   2773 	diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2774 	solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2775 	solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2776 
   2777 	// all unbounded variables are clamped
   2778 	numClamped = numUnbounded;
   2779 
   2780 	// if there are unbounded variables
   2781 	if ( numUnbounded ) {
   2782 
   2783 		// factor and solve for unbounded variables
   2784 		if ( !FactorClamped() ) {
   2785 			idLib::Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" );
   2786 			return false;
   2787 		}
   2788 		SolveClamped( f, b.ToFloatPtr() );
   2789 
   2790 		// if there are no bounded variables we are done
   2791 		if ( numUnbounded == m.GetNumRows() ) {
   2792 			o_x = f;	// the vector is not permuted
   2793 			return true;
   2794 		}
   2795 	}
   2796 
   2797 	int numIgnored = 0;
   2798 
   2799 	// allocate for delta force and delta acceleration
   2800 	delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2801 	delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) );
   2802 
   2803 	// solve for bounded variables
   2804 	idStr failed;
   2805 	for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) {
   2806 
   2807 		clampedChangeStart = 0;
   2808 
   2809 		// once we hit the box start index we can initialize the low and high boundaries of the variables using the box index
   2810 		if ( i == boxStartIndex ) {
   2811 			for ( int j = 0; j < boxStartIndex; j++ ) {
   2812 				o_x[permuted[j]] = f[j];
   2813 			}
   2814 			for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) {
   2815 				float s = o_x[boxIndex[j]];
   2816 				if ( lo[j] != -idMath::INFINITY ) {
   2817 					lo[j] = - idMath::Fabs( lo[j] * s );
   2818 				}
   2819 				if ( hi[j] != idMath::INFINITY ) {
   2820 					hi[j] = idMath::Fabs( hi[j] * s );
   2821 				}
   2822 			}
   2823 		}
   2824 
   2825 		// calculate acceleration for current variable
   2826 		float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i );
   2827 		a[i] = dot - b[i];
   2828 
   2829 		// if already at the low boundary
   2830 		if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) {
   2831 			side[i] = -1;
   2832 			continue;
   2833 		}
   2834 
   2835 		// if already at the high boundary
   2836 		if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) {
   2837 			side[i] = 1;
   2838 			continue;
   2839 		}
   2840 
   2841 		// if inside the clamped region
   2842 		if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) {
   2843 			side[i] = 0;
   2844 			AddClamped( i, false );
   2845 			continue;
   2846 		}
   2847 
   2848 		// drive the current variable into a valid region
   2849 		int n = 0;
   2850 		for ( ; n < maxIterations; n++ ) {
   2851 
   2852 			// direction to move
   2853 			float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f;
   2854 
   2855 			// calculate force delta
   2856 			CalcForceDelta( i, dir );
   2857 
   2858 			// calculate acceleration delta: delta_a = m * delta_f;
   2859 			CalcAccelDelta( i );
   2860 
   2861 			float maxStep;
   2862 			int limit;
   2863 			int limitSide;
   2864 
   2865 			// maximum step we can take
   2866 			GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(),
   2867 							lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped,
   2868 								i, dir, maxStep, limit, limitSide );
   2869 
   2870 			if ( maxStep <= 0.0f ) {
   2871 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
   2872 				// ignore the current variable completely
   2873 				lo[i] = hi[i] = 0.0f;
   2874 				f[i] = 0.0f;
   2875 				side[i] = -1;
   2876 				numIgnored++;
   2877 #else
   2878 				failed.Format( "invalid step size %.4f", maxStep );
   2879 				for ( int j = i; j < m.GetNumRows(); j++ ) {
   2880 					f[j] = 0.0f;
   2881 				}
   2882 				numIgnored = m.GetNumRows() - i;
   2883 #endif
   2884 				break;
   2885 			}
   2886 
   2887 			// change force
   2888 			ChangeForce( i, maxStep );
   2889 
   2890 			// change acceleration
   2891 			ChangeAccel( i, maxStep );
   2892 
   2893 			// clamp/unclamp the variable that limited this step
   2894 			side[limit] = limitSide;
   2895 			if ( limitSide == 0 ) {
   2896 				a[limit] = 0.0f;
   2897 				AddClamped( limit, ( limit == i ) );
   2898 			} else if ( limitSide == -1 ) {
   2899 				f[limit] = lo[limit];
   2900 				if ( limit != i ) {
   2901 					RemoveClamped( limit );
   2902 				}
   2903 			} else if ( limitSide == 1 ) {
   2904 				f[limit] = hi[limit];
   2905 				if ( limit != i ) {
   2906 					RemoveClamped( limit );
   2907 				}
   2908 			}
   2909 
   2910 			// if the current variable limited the step we can continue with the next variable
   2911 			if ( limit == i ) {
   2912 				break;
   2913 			}
   2914 		}
   2915 
   2916 		if ( n >= maxIterations ) {
   2917 			failed.Format( "max iterations %d", maxIterations );
   2918 			break;
   2919 		}
   2920 
   2921 		if ( failed.Length() ) {
   2922 			break;
   2923 		}
   2924 	}
   2925 
   2926 #ifdef IGNORE_UNSATISFIABLE_VARIABLES
   2927 	if ( numIgnored ) {
   2928 		if ( lcp_showFailures.GetBool() ) {
   2929 			idLib::Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded );
   2930 		}
   2931 	}
   2932 #endif
   2933 
   2934 	// if failed clear remaining forces
   2935 	if ( failed.Length() ) {
   2936 		if ( lcp_showFailures.GetBool() ) {
   2937 			idLib::Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded );
   2938 		}
   2939 	}
   2940 
   2941 #if defined(_DEBUG) && 0
   2942 	if ( failed.Length() ) {
   2943 		// test whether or not the solution satisfies the complementarity conditions
   2944 		for ( int i = 0; i < m.GetNumRows(); i++ ) {
   2945 			a[i] = -b[i];
   2946 			for ( j = 0; j < m.GetNumRows(); j++ ) {
   2947 				a[i] += rowPtrs[i][j] * f[j];
   2948 			}
   2949 
   2950 			if ( f[i] == lo[i] ) {
   2951 				if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) {
   2952 					int bah1 = 1;
   2953 				}
   2954 			} else if ( f[i] == hi[i] ) {
   2955 				if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) {
   2956 					int bah2 = 1;
   2957 				}
   2958 			} else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) {
   2959 				int bah3 = 1;
   2960 			}
   2961 		}
   2962 	}
   2963 #endif
   2964 
   2965 	// unpermute result
   2966 	for ( int i = 0; i < f.GetSize(); i++ ) {
   2967 		o_x[permuted[i]] = f[i];
   2968 	}
   2969 
   2970 	return true;
   2971 }
   2972 
   2973 /*
   2974 ================================================================================================
   2975 
   2976 	idLCP
   2977 
   2978 ================================================================================================
   2979 */
   2980 
   2981 /*
   2982 ========================
   2983 idLCP::AllocSquare
   2984 ========================
   2985 */
   2986 idLCP *idLCP::AllocSquare() {
   2987 	idLCP *lcp = new idLCP_Square;
   2988 	lcp->SetMaxIterations( 32 );
   2989 	return lcp;
   2990 }
   2991 
   2992 /*
   2993 ========================
   2994 idLCP::AllocSymmetric
   2995 ========================
   2996 */
   2997 idLCP *idLCP::AllocSymmetric() {
   2998 	idLCP *lcp = new idLCP_Symmetric;
   2999 	lcp->SetMaxIterations( 32 );
   3000 	return lcp;
   3001 }
   3002 
   3003 /*
   3004 ========================
   3005 idLCP::~idLCP
   3006 ========================
   3007 */
   3008 idLCP::~idLCP() {
   3009 }
   3010 
   3011 /*
   3012 ========================
   3013 idLCP::SetMaxIterations
   3014 ========================
   3015 */
   3016 void idLCP::SetMaxIterations( int max ) {
   3017 	maxIterations = max;
   3018 }
   3019 
   3020 /*
   3021 ========================
   3022 idLCP::GetMaxIterations
   3023 ========================
   3024 */
   3025 int idLCP::GetMaxIterations() {
   3026 	return maxIterations;
   3027 }
   3028 
   3029 /*
   3030 ========================
   3031 idLCP::Test_f
   3032 ========================
   3033 */
   3034 void idLCP::Test_f( const idCmdArgs &args ) {
   3035 #ifdef ENABLE_TEST_CODE
   3036 	DotProduct_Test();
   3037 	LowerTriangularSolve_Test();
   3038 	LowerTriangularSolveTranspose_Test();
   3039 	LDLT_Factor_Test();
   3040 #endif
   3041 }