Lcp.cpp (88972B)
1 /* 2 =========================================================================== 3 4 Doom 3 BFG Edition GPL Source Code 5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 6 7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 8 9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>. 21 22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. 23 24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. 25 26 =========================================================================== 27 */ 28 #pragma hdrstop 29 #include "../precompiled.h" 30 31 // this file is full of intentional case fall throughs 32 //lint -e616 33 34 // the code is correct, it can't be a NULL pointer 35 //lint -e613 36 37 static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_BOOL, "show LCP solver failures" ); 38 39 const float LCP_BOUND_EPSILON = 1e-5f; 40 const float LCP_ACCEL_EPSILON = 1e-5f; 41 const float LCP_DELTA_ACCEL_EPSILON = 1e-9f; 42 const float LCP_DELTA_FORCE_EPSILON = 1e-9f; 43 44 #define IGNORE_UNSATISFIABLE_VARIABLES 45 46 47 #if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN ) 48 49 ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f }; 50 ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f }; 51 ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f }; 52 ALIGN16( const __m128 SIMD_SP_tiny ) = { 1e-10f, 1e-10f, 1e-10f, 1e-10f }; 53 ALIGN16( const __m128 SIMD_SP_infinity ) = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY }; 54 ALIGN16( const __m128 SIMD_SP_LCP_DELTA_ACCEL_EPSILON ) = { LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON }; 55 ALIGN16( const __m128 SIMD_SP_LCP_DELTA_FORCE_EPSILON ) = { LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON }; 56 ALIGN16( const __m128 SIMD_SP_LCP_BOUND_EPSILON ) = { LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON }; 57 ALIGN16( const __m128 SIMD_SP_neg_LCP_BOUND_EPSILON ) = { -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON }; 58 ALIGN16( const unsigned int SIMD_SP_signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK }; 59 ALIGN16( const unsigned int SIMD_SP_absMask[4] ) = { ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK, ~IEEE_FLT_SIGN_MASK }; 60 ALIGN16( const unsigned int SIMD_SP_indexedStartMask[4][4] ) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0, 0xFFFFFFFF } }; 61 ALIGN16( const unsigned int SIMD_SP_indexedEndMask[4][4] ) = { { 0, 0, 0, 0 }, { 0xFFFFFFFF, 0, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 } }; 62 ALIGN16( const unsigned int SIMD_SP_clearLast1[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }; 63 ALIGN16( const unsigned int SIMD_SP_clearLast2[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }; 64 ALIGN16( const unsigned int SIMD_SP_clearLast3[4] ) = { 0xFFFFFFFF, 0, 0, 0 }; 65 ALIGN16( const unsigned int SIMD_DW_zero[4] ) = { 0, 0, 0, 0 }; 66 ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 }; 67 ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 }; 68 ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 }; 69 ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 }; 70 71 #endif 72 73 /* 74 ======================== 75 Multiply_SIMD 76 77 dst[i] = src0[i] * src1[i]; 78 79 Assumes the source and destination have the same memory alignment. 80 ======================== 81 */ 82 static void Multiply_SIMD( float * dst, const float * src0, const float * src1, const int count ) { 83 int i = 0; 84 for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) { 85 dst[i] = src0[i] * src1[i]; 86 } 87 88 #ifdef ID_WIN_X86_SSE_INTRIN 89 90 for ( ; i + 4 <= count; i += 4 ) { 91 assert_16_byte_aligned( &dst[i] ); 92 assert_16_byte_aligned( &src0[i] ); 93 assert_16_byte_aligned( &src1[i] ); 94 95 __m128 s0 = _mm_load_ps( src0 + i ); 96 __m128 s1 = _mm_load_ps( src1 + i ); 97 s0 = _mm_mul_ps( s0, s1 ); 98 _mm_store_ps( dst + i, s0 ); 99 } 100 101 #else 102 103 for ( ; i + 4 <= count; i += 4 ) { 104 assert_16_byte_aligned( &dst[i] ); 105 assert_16_byte_aligned( &src0[i] ); 106 assert_16_byte_aligned( &src1[i] ); 107 108 dst[i+0] = src0[i+0] * src1[i+0]; 109 dst[i+1] = src0[i+1] * src1[i+1]; 110 dst[i+2] = src0[i+2] * src1[i+2]; 111 dst[i+3] = src0[i+3] * src1[i+3]; 112 } 113 114 #endif 115 116 for ( ; i < count; i++ ) { 117 dst[i] = src0[i] * src1[i]; 118 } 119 } 120 121 /* 122 ======================== 123 MultiplyAdd_SIMD 124 125 dst[i] += constant * src[i]; 126 127 Assumes the source and destination have the same memory alignment. 128 ======================== 129 */ 130 static void MultiplyAdd_SIMD( float * dst, const float constant, const float * src, const int count ) { 131 int i = 0; 132 for ( ; ( (unsigned int)dst & 0xF ) != 0 && i < count; i++ ) { 133 dst[i] += constant * src[i]; 134 } 135 136 #ifdef ID_WIN_X86_SSE_INTRIN 137 138 __m128 c = _mm_load1_ps( & constant ); 139 for ( ; i + 4 <= count; i += 4 ) { 140 assert_16_byte_aligned( &dst[i] ); 141 assert_16_byte_aligned( &src[i] ); 142 143 __m128 s = _mm_load_ps( src + i ); 144 __m128 d = _mm_load_ps( dst + i ); 145 s = _mm_add_ps( _mm_mul_ps( s, c ), d ); 146 _mm_store_ps( dst + i, s ); 147 } 148 149 #else 150 151 for ( ; i + 4 <= count; i += 4 ) { 152 assert_16_byte_aligned( &src[i] ); 153 assert_16_byte_aligned( &dst[i] ); 154 155 dst[i+0] += constant * src[i+0]; 156 dst[i+1] += constant * src[i+1]; 157 dst[i+2] += constant * src[i+2]; 158 dst[i+3] += constant * src[i+3]; 159 } 160 161 #endif 162 163 for ( ; i < count; i++ ) { 164 dst[i] += constant * src[i]; 165 } 166 } 167 168 /* 169 ======================== 170 DotProduct_SIMD 171 172 dot = src0[0] * src1[0] + src0[1] * src1[1] + src0[2] * src1[2] + ... 173 ======================== 174 */ 175 static float DotProduct_SIMD( const float * src0, const float * src1, const int count ) { 176 assert_16_byte_aligned( src0 ); 177 assert_16_byte_aligned( src1 ); 178 179 #ifdef ID_WIN_X86_SSE_INTRIN 180 181 __m128 sum = (__m128 &) SIMD_SP_zero; 182 int i = 0; 183 for ( ; i < count - 3; i += 4 ) { 184 __m128 s0 = _mm_load_ps( src0 + i ); 185 __m128 s1 = _mm_load_ps( src1 + i ); 186 sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum ); 187 } 188 __m128 mask = _mm_load_ps( (float *) &SIMD_SP_indexedEndMask[count & 3] ); 189 __m128 s0 = _mm_and_ps( _mm_load_ps( src0 + i ), mask ); 190 __m128 s1 = _mm_and_ps( _mm_load_ps( src1 + i ), mask ); 191 sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum ); 192 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 193 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 194 float dot; 195 _mm_store_ss( & dot, sum ); 196 return dot; 197 198 #else 199 200 float s0 = 0.0f; 201 float s1 = 0.0f; 202 float s2 = 0.0f; 203 float s3 = 0.0f; 204 int i = 0; 205 for ( ; i < count - 3; i += 4 ) { 206 s0 += src0[i+4] * src1[i+4]; 207 s1 += src0[i+5] * src1[i+5]; 208 s2 += src0[i+6] * src1[i+6]; 209 s3 += src0[i+7] * src1[i+7]; 210 } 211 switch( count - i ) { 212 NODEFAULT; 213 case 3: s0 += src0[i+2] * src1[i+2]; 214 case 2: s1 += src0[i+1] * src1[i+1]; 215 case 1: s2 += src0[i+0] * src1[i+0]; 216 case 0: 217 break; 218 } 219 return s0 + s1 + s2 + s3; 220 221 #endif 222 } 223 224 /* 225 ======================== 226 LowerTriangularSolve_SIMD 227 228 Solves x in Lx = b for the n * n sub-matrix of L. 229 * if skip > 0 the first skip elements of x are assumed to be valid already 230 * L has to be a lower triangular matrix with (implicit) ones on the diagonal 231 * x == b is allowed 232 ======================== 233 */ 234 static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float * b, const int n, int skip ) { 235 if ( skip >= n ) { 236 return; 237 } 238 239 const float *lptr = L.ToFloatPtr(); 240 int nc = L.GetNumColumns(); 241 242 assert( ( nc & 3 ) == 0 ); 243 244 // unrolled cases for n < 8 245 if ( n < 8 ) { 246 #define NSKIP( n, s ) ((n<<3)|(s&7)) 247 switch( NSKIP( n, skip ) ) { 248 case NSKIP( 1, 0 ): x[0] = b[0]; 249 return; 250 case NSKIP( 2, 0 ): x[0] = b[0]; 251 case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 252 return; 253 case NSKIP( 3, 0 ): x[0] = b[0]; 254 case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 255 case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 256 return; 257 case NSKIP( 4, 0 ): x[0] = b[0]; 258 case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 259 case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 260 case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; 261 return; 262 case NSKIP( 5, 0 ): x[0] = b[0]; 263 case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 264 case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 265 case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; 266 case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; 267 return; 268 case NSKIP( 6, 0 ): x[0] = b[0]; 269 case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 270 case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 271 case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; 272 case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; 273 case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; 274 return; 275 case NSKIP( 7, 0 ): x[0] = b[0]; 276 case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; 277 case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 278 case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; 279 case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; 280 case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; 281 case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5]; 282 return; 283 } 284 #undef NSKIP 285 return; 286 } 287 288 // process first 4 rows 289 switch( skip ) { 290 case 0: x[0] = b[0]; 291 case 1: x[1] = b[1] - lptr[1*nc+0] * x[0]; 292 case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; 293 case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; 294 skip = 4; 295 } 296 297 lptr = L[skip]; 298 299 int i = skip; 300 301 #ifdef ID_WIN_X86_SSE_INTRIN 302 303 // work up to a multiple of 4 rows 304 for ( ; ( i & 3 ) != 0 && i < n; i++ ) { 305 __m128 sum = _mm_load_ss( & b[i] ); 306 int j = 0; 307 for ( ; j < i - 3; j += 4 ) { 308 __m128 s0 = _mm_load_ps( lptr + j ); 309 __m128 s1 = _mm_load_ps( x + j ); 310 sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); 311 } 312 __m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] ); 313 __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask ); 314 __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask ); 315 sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); 316 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 317 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 318 _mm_store_ss( & x[i], sum ); 319 lptr += nc; 320 } 321 322 for ( ; i + 3 < n; i += 4 ) { 323 const float * lptr0 = &lptr[0*nc]; 324 const float * lptr1 = &lptr[1*nc]; 325 const float * lptr2 = &lptr[2*nc]; 326 const float * lptr3 = &lptr[3*nc]; 327 328 assert_16_byte_aligned( lptr0 ); 329 assert_16_byte_aligned( lptr1 ); 330 assert_16_byte_aligned( lptr2 ); 331 assert_16_byte_aligned( lptr3 ); 332 333 __m128 va = _mm_load_ss( & b[i+0] ); 334 __m128 vb = _mm_load_ss( & b[i+1] ); 335 __m128 vc = _mm_load_ss( & b[i+2] ); 336 __m128 vd = _mm_load_ss( & b[i+3] ); 337 338 __m128 x0 = _mm_load_ps( & x[0] ); 339 340 va = _mm_sub_ps( va, _mm_mul_ps( x0, _mm_load_ps( lptr0 + 0 ) ) ); 341 vb = _mm_sub_ps( vb, _mm_mul_ps( x0, _mm_load_ps( lptr1 + 0 ) ) ); 342 vc = _mm_sub_ps( vc, _mm_mul_ps( x0, _mm_load_ps( lptr2 + 0 ) ) ); 343 vd = _mm_sub_ps( vd, _mm_mul_ps( x0, _mm_load_ps( lptr3 + 0 ) ) ); 344 345 for ( int j = 4; j < i; j += 4 ) { 346 __m128 xj = _mm_load_ps( &x[j] ); 347 348 va = _mm_sub_ps( va, _mm_mul_ps( xj, _mm_load_ps( lptr0 + j ) ) ); 349 vb = _mm_sub_ps( vb, _mm_mul_ps( xj, _mm_load_ps( lptr1 + j ) ) ); 350 vc = _mm_sub_ps( vc, _mm_mul_ps( xj, _mm_load_ps( lptr2 + j ) ) ); 351 vd = _mm_sub_ps( vd, _mm_mul_ps( xj, _mm_load_ps( lptr3 + j ) ) ); 352 } 353 354 vb = _mm_sub_ps( vb, _mm_mul_ps( va, _mm_load1_ps( lptr1 + i + 0 ) ) ); 355 vc = _mm_sub_ps( vc, _mm_mul_ps( va, _mm_load1_ps( lptr2 + i + 0 ) ) ); 356 vc = _mm_sub_ps( vc, _mm_mul_ps( vb, _mm_load1_ps( lptr2 + i + 1 ) ) ); 357 vd = _mm_sub_ps( vd, _mm_mul_ps( va, _mm_load1_ps( lptr3 + i + 0 ) ) ); 358 vd = _mm_sub_ps( vd, _mm_mul_ps( vb, _mm_load1_ps( lptr3 + i + 1 ) ) ); 359 vd = _mm_sub_ps( vd, _mm_mul_ps( vc, _mm_load1_ps( lptr3 + i + 2 ) ) ); 360 361 __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1 362 __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3 363 __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1 364 __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3 365 366 va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0 367 vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1 368 vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2 369 vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3 370 371 va = _mm_add_ps( va, vb ); 372 vc = _mm_add_ps( vc, vd ); 373 va = _mm_add_ps( va, vc ); 374 375 _mm_store_ps( & x[i], va ); 376 377 lptr += 4 * nc; 378 } 379 380 // go through any remaining rows 381 for ( ; i < n; i++ ) { 382 __m128 sum = _mm_load_ss( & b[i] ); 383 int j = 0; 384 for ( ; j < i - 3; j += 4 ) { 385 __m128 s0 = _mm_load_ps( lptr + j ); 386 __m128 s1 = _mm_load_ps( x + j ); 387 sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); 388 } 389 __m128 mask = _mm_load_ps( (float *) & SIMD_SP_indexedEndMask[i & 3] ); 390 __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask ); 391 __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask ); 392 sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); 393 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 394 sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 395 _mm_store_ss( & x[i], sum ); 396 lptr += nc; 397 } 398 399 #else 400 401 // work up to a multiple of 4 rows 402 for ( ; ( i & 3 ) != 0 && i < n; i++ ) { 403 float sum = b[i]; 404 for ( int j = 0; j < i; j++ ) { 405 sum -= lptr[j] * x[j]; 406 } 407 x[i] = sum; 408 lptr += nc; 409 } 410 411 assert_16_byte_aligned( x ); 412 413 for ( ; i + 3 < n; i += 4 ) { 414 const float * lptr0 = &lptr[0*nc]; 415 const float * lptr1 = &lptr[1*nc]; 416 const float * lptr2 = &lptr[2*nc]; 417 const float * lptr3 = &lptr[3*nc]; 418 419 assert_16_byte_aligned( lptr0 ); 420 assert_16_byte_aligned( lptr1 ); 421 assert_16_byte_aligned( lptr2 ); 422 assert_16_byte_aligned( lptr3 ); 423 424 float a0 = - lptr0[0] * x[0] + b[i+0]; 425 float a1 = - lptr0[1] * x[1]; 426 float a2 = - lptr0[2] * x[2]; 427 float a3 = - lptr0[3] * x[3]; 428 429 float b0 = - lptr1[0] * x[0] + b[i+1]; 430 float b1 = - lptr1[1] * x[1]; 431 float b2 = - lptr1[2] * x[2]; 432 float b3 = - lptr1[3] * x[3]; 433 434 float c0 = - lptr2[0] * x[0] + b[i+2]; 435 float c1 = - lptr2[1] * x[1]; 436 float c2 = - lptr2[2] * x[2]; 437 float c3 = - lptr2[3] * x[3]; 438 439 float d0 = - lptr3[0] * x[0] + b[i+3]; 440 float d1 = - lptr3[1] * x[1]; 441 float d2 = - lptr3[2] * x[2]; 442 float d3 = - lptr3[3] * x[3]; 443 444 for ( int j = 4; j < i; j += 4 ) { 445 a0 -= lptr0[j+0] * x[j+0]; 446 a1 -= lptr0[j+1] * x[j+1]; 447 a2 -= lptr0[j+2] * x[j+2]; 448 a3 -= lptr0[j+3] * x[j+3]; 449 450 b0 -= lptr1[j+0] * x[j+0]; 451 b1 -= lptr1[j+1] * x[j+1]; 452 b2 -= lptr1[j+2] * x[j+2]; 453 b3 -= lptr1[j+3] * x[j+3]; 454 455 c0 -= lptr2[j+0] * x[j+0]; 456 c1 -= lptr2[j+1] * x[j+1]; 457 c2 -= lptr2[j+2] * x[j+2]; 458 c3 -= lptr2[j+3] * x[j+3]; 459 460 d0 -= lptr3[j+0] * x[j+0]; 461 d1 -= lptr3[j+1] * x[j+1]; 462 d2 -= lptr3[j+2] * x[j+2]; 463 d3 -= lptr3[j+3] * x[j+3]; 464 } 465 466 b0 -= lptr1[i+0] * a0; 467 b1 -= lptr1[i+0] * a1; 468 b2 -= lptr1[i+0] * a2; 469 b3 -= lptr1[i+0] * a3; 470 471 c0 -= lptr2[i+0] * a0; 472 c1 -= lptr2[i+0] * a1; 473 c2 -= lptr2[i+0] * a2; 474 c3 -= lptr2[i+0] * a3; 475 476 c0 -= lptr2[i+1] * b0; 477 c1 -= lptr2[i+1] * b1; 478 c2 -= lptr2[i+1] * b2; 479 c3 -= lptr2[i+1] * b3; 480 481 d0 -= lptr3[i+0] * a0; 482 d1 -= lptr3[i+0] * a1; 483 d2 -= lptr3[i+0] * a2; 484 d3 -= lptr3[i+0] * a3; 485 486 d0 -= lptr3[i+1] * b0; 487 d1 -= lptr3[i+1] * b1; 488 d2 -= lptr3[i+1] * b2; 489 d3 -= lptr3[i+1] * b3; 490 491 d0 -= lptr3[i+2] * c0; 492 d1 -= lptr3[i+2] * c1; 493 d2 -= lptr3[i+2] * c2; 494 d3 -= lptr3[i+2] * c3; 495 496 x[i+0] = a0 + a1 + a2 + a3; 497 x[i+1] = b0 + b1 + b2 + b3; 498 x[i+2] = c0 + c1 + c2 + c3; 499 x[i+3] = d0 + d1 + d2 + d3; 500 501 lptr += 4 * nc; 502 } 503 504 // go through any remaining rows 505 for ( ; i < n; i++ ) { 506 float sum = b[i]; 507 for ( int j = 0; j < i; j++ ) { 508 sum -= lptr[j] * x[j]; 509 } 510 x[i] = sum; 511 lptr += nc; 512 } 513 514 #endif 515 516 } 517 518 /* 519 ======================== 520 LowerTriangularSolveTranspose_SIMD 521 522 Solves x in L'x = b for the n * n sub-matrix of L. 523 * L has to be a lower triangular matrix with (implicit) ones on the diagonal 524 * x == b is allowed 525 ======================== 526 */ 527 static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, const float * b, const int n ) { 528 int nc = L.GetNumColumns(); 529 530 assert( ( nc & 3 ) == 0 ); 531 532 int m = n; 533 int r = n & 3; 534 535 if ( ( m & 3 ) != 0 ) { 536 const float * lptr = L.ToFloatPtr() + m * nc + m; 537 if ( ( m & 3 ) == 1 ) { 538 x[m-1] = b[m-1]; 539 m -= 1; 540 } else if ( ( m & 3 ) == 2 ) { 541 x[m-1] = b[m-1]; 542 x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1]; 543 m -= 2; 544 } else { 545 x[m-1] = b[m-1]; 546 x[m-2] = b[m-2] - lptr[-1*nc-2] * x[m-1]; 547 x[m-3] = b[m-3] - lptr[-1*nc-3] * x[m-1] - lptr[-2*nc-3] * x[m-2]; 548 m -= 3; 549 } 550 } 551 552 const float * lptr = L.ToFloatPtr() + m * nc + m - 4; 553 float * xptr = x + m; 554 555 #ifdef ID_WIN_X86_SSE2_INTRIN 556 557 // process 4 rows at a time 558 for ( int i = m; i >= 4; i -= 4 ) { 559 assert_16_byte_aligned( b ); 560 assert_16_byte_aligned( xptr ); 561 assert_16_byte_aligned( lptr ); 562 563 __m128 s0 = _mm_load_ps( &b[i-4] ); 564 __m128 s1 = (__m128 &)SIMD_SP_zero; 565 __m128 s2 = (__m128 &)SIMD_SP_zero; 566 __m128 s3 = (__m128 &)SIMD_SP_zero; 567 568 // process 4x4 blocks 569 const float * xptr2 = xptr; // x + i; 570 const float * lptr2 = lptr; // ptr = L[i] + i - 4; 571 for ( int j = i; j < m; j += 4 ) { 572 __m128 xj = _mm_load_ps( xptr2 ); 573 574 s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_splat_ps( xj, 0 ), _mm_load_ps( lptr2 + 0 * nc ) ) ); 575 s1 = _mm_sub_ps( s1, _mm_mul_ps( _mm_splat_ps( xj, 1 ), _mm_load_ps( lptr2 + 1 * nc ) ) ); 576 s2 = _mm_sub_ps( s2, _mm_mul_ps( _mm_splat_ps( xj, 2 ), _mm_load_ps( lptr2 + 2 * nc ) ) ); 577 s3 = _mm_sub_ps( s3, _mm_mul_ps( _mm_splat_ps( xj, 3 ), _mm_load_ps( lptr2 + 3 * nc ) ) ); 578 579 lptr2 += 4 * nc; 580 xptr2 += 4; 581 } 582 for ( int j = 0; j < r; j++ ) { 583 s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_load_ps( lptr2 ), _mm_load1_ps( &xptr2[j] ) ) ); 584 lptr2 += nc; 585 } 586 s0 = _mm_add_ps( s0, s1 ); 587 s2 = _mm_add_ps( s2, s3 ); 588 s0 = _mm_add_ps( s0, s2 ); 589 // process left over of the 4 rows 590 lptr -= 4 * nc; 591 __m128 t0 = _mm_and_ps( _mm_load_ps( lptr + 3 * nc ), (__m128 &)SIMD_SP_clearLast1 ); 592 __m128 t1 = _mm_and_ps( _mm_load_ps( lptr + 2 * nc ), (__m128 &)SIMD_SP_clearLast2 ); 593 __m128 t2 = _mm_load_ss( lptr + 1 * nc ); 594 s0 = _mm_sub_ps( s0, _mm_mul_ps( t0, _mm_splat_ps( s0, 3 ) ) ); 595 s0 = _mm_sub_ps( s0, _mm_mul_ps( t1, _mm_splat_ps( s0, 2 ) ) ); 596 s0 = _mm_sub_ps( s0, _mm_mul_ps( t2, _mm_splat_ps( s0, 1 ) ) ); 597 // store result 598 _mm_store_ps( &xptr[-4], s0 ); 599 // update pointers for next four rows 600 lptr -= 4; 601 xptr -= 4; 602 } 603 604 #else 605 606 // process 4 rows at a time 607 for ( int i = m; i >= 4; i -= 4 ) { 608 assert_16_byte_aligned( b ); 609 assert_16_byte_aligned( xptr ); 610 assert_16_byte_aligned( lptr ); 611 612 float s0 = b[i-4]; 613 float s1 = b[i-3]; 614 float s2 = b[i-2]; 615 float s3 = b[i-1]; 616 // process 4x4 blocks 617 const float * xptr2 = xptr; // x + i; 618 const float * lptr2 = lptr; // ptr = L[i] + i - 4; 619 for ( int j = i; j < m; j += 4 ) { 620 float t0 = xptr2[0]; 621 s0 -= lptr2[0] * t0; 622 s1 -= lptr2[1] * t0; 623 s2 -= lptr2[2] * t0; 624 s3 -= lptr2[3] * t0; 625 lptr2 += nc; 626 float t1 = xptr2[1]; 627 s0 -= lptr2[0] * t1; 628 s1 -= lptr2[1] * t1; 629 s2 -= lptr2[2] * t1; 630 s3 -= lptr2[3] * t1; 631 lptr2 += nc; 632 float t2 = xptr2[2]; 633 s0 -= lptr2[0] * t2; 634 s1 -= lptr2[1] * t2; 635 s2 -= lptr2[2] * t2; 636 s3 -= lptr2[3] * t2; 637 lptr2 += nc; 638 float t3 = xptr2[3]; 639 s0 -= lptr2[0] * t3; 640 s1 -= lptr2[1] * t3; 641 s2 -= lptr2[2] * t3; 642 s3 -= lptr2[3] * t3; 643 lptr2 += nc; 644 xptr2 += 4; 645 } 646 for ( int j = 0; j < r; j++ ) { 647 float t = xptr2[j]; 648 s0 -= lptr2[0] * t; 649 s1 -= lptr2[1] * t; 650 s2 -= lptr2[2] * t; 651 s3 -= lptr2[3] * t; 652 lptr2 += nc; 653 } 654 // process left over of the 4 rows 655 lptr -= nc; 656 s0 -= lptr[0] * s3; 657 s1 -= lptr[1] * s3; 658 s2 -= lptr[2] * s3; 659 lptr -= nc; 660 s0 -= lptr[0] * s2; 661 s1 -= lptr[1] * s2; 662 lptr -= nc; 663 s0 -= lptr[0] * s1; 664 lptr -= nc; 665 // store result 666 xptr[-4] = s0; 667 xptr[-3] = s1; 668 xptr[-2] = s2; 669 xptr[-1] = s3; 670 // update pointers for next four rows 671 lptr -= 4; 672 xptr -= 4; 673 } 674 675 #endif 676 677 } 678 679 /* 680 ======================== 681 UpperTriangularSolve_SIMD 682 683 Solves x in Ux = b for the n * n sub-matrix of U. 684 * U has to be a upper triangular matrix 685 * invDiag is the reciprical of the diagonal of the upper triangular matrix. 686 * x == b is allowed 687 ======================== 688 */ 689 static void UpperTriangularSolve_SIMD( const idMatX & U, const float * invDiag, float * x, const float * b, const int n ) { 690 for ( int i = n - 1; i >= 0; i-- ) { 691 float sum = b[i]; 692 const float * uptr = U[i]; 693 for ( int j = i + 1; j < n; j++ ) { 694 sum -= uptr[j] * x[j]; 695 } 696 x[i] = sum * invDiag[i]; 697 } 698 } 699 700 /* 701 ======================== 702 LU_Factor_SIMD 703 704 In-place factorization LU of the n * n sub-matrix of mat. The reciprocal of the diagonal 705 elements of U are stored in invDiag. No pivoting is used. 706 ======================== 707 */ 708 static bool LU_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) { 709 for ( int i = 0; i < n; i++ ) { 710 711 float d1 = mat[i][i]; 712 713 if ( fabs( d1 ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 714 return false; 715 } 716 717 invDiag[i] = d1 = 1.0f / d1; 718 719 float * ptr1 = mat[i]; 720 721 for ( int j = i + 1; j < n; j++ ) { 722 723 float * ptr2 = mat[j]; 724 float d2 = ptr2[i] * d1; 725 ptr2[i] = d2; 726 727 int k; 728 for ( k = n - 1; k > i + 15; k -= 16 ) { 729 ptr2[k-0] -= d2 * ptr1[k-0]; 730 ptr2[k-1] -= d2 * ptr1[k-1]; 731 ptr2[k-2] -= d2 * ptr1[k-2]; 732 ptr2[k-3] -= d2 * ptr1[k-3]; 733 ptr2[k-4] -= d2 * ptr1[k-4]; 734 ptr2[k-5] -= d2 * ptr1[k-5]; 735 ptr2[k-6] -= d2 * ptr1[k-6]; 736 ptr2[k-7] -= d2 * ptr1[k-7]; 737 ptr2[k-8] -= d2 * ptr1[k-8]; 738 ptr2[k-9] -= d2 * ptr1[k-9]; 739 ptr2[k-10] -= d2 * ptr1[k-10]; 740 ptr2[k-11] -= d2 * ptr1[k-11]; 741 ptr2[k-12] -= d2 * ptr1[k-12]; 742 ptr2[k-13] -= d2 * ptr1[k-13]; 743 ptr2[k-14] -= d2 * ptr1[k-14]; 744 ptr2[k-15] -= d2 * ptr1[k-15]; 745 } 746 switch( k - i ) { 747 NODEFAULT; 748 case 15: ptr2[k-14] -= d2 * ptr1[k-14]; 749 case 14: ptr2[k-13] -= d2 * ptr1[k-13]; 750 case 13: ptr2[k-12] -= d2 * ptr1[k-12]; 751 case 12: ptr2[k-11] -= d2 * ptr1[k-11]; 752 case 11: ptr2[k-10] -= d2 * ptr1[k-10]; 753 case 10: ptr2[k-9] -= d2 * ptr1[k-9]; 754 case 9: ptr2[k-8] -= d2 * ptr1[k-8]; 755 case 8: ptr2[k-7] -= d2 * ptr1[k-7]; 756 case 7: ptr2[k-6] -= d2 * ptr1[k-6]; 757 case 6: ptr2[k-5] -= d2 * ptr1[k-5]; 758 case 5: ptr2[k-4] -= d2 * ptr1[k-4]; 759 case 4: ptr2[k-3] -= d2 * ptr1[k-3]; 760 case 3: ptr2[k-2] -= d2 * ptr1[k-2]; 761 case 2: ptr2[k-1] -= d2 * ptr1[k-1]; 762 case 1: ptr2[k-0] -= d2 * ptr1[k-0]; 763 case 0: break; 764 } 765 } 766 } 767 768 return true; 769 } 770 771 /* 772 ======================== 773 LDLT_Factor_SIMD 774 775 In-place factorization LDL' of the n * n sub-matrix of mat. The reciprocal of the diagonal 776 elements are stored in invDiag. 777 778 NOTE: The number of columns of mat must be a multiple of 4. 779 ======================== 780 */ 781 static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) { 782 float s0, s1, s2, d; 783 784 float * v = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) ); 785 float * diag = (float *) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) ); 786 float * invDiagPtr = invDiag.ToFloatPtr(); 787 788 int nc = mat.GetNumColumns(); 789 790 assert( ( nc & 3 ) == 0 ); 791 792 if ( n <= 0 ) { 793 return true; 794 } 795 796 float * mptr = mat[0]; 797 798 float sum = mptr[0]; 799 800 if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 801 return false; 802 } 803 804 diag[0] = sum; 805 invDiagPtr[0] = d = 1.0f / sum; 806 807 if ( n <= 1 ) { 808 return true; 809 } 810 811 mptr = mat[0]; 812 for ( int j = 1; j < n; j++ ) { 813 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; 814 } 815 816 mptr = mat[1]; 817 818 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; 819 sum = mptr[1] - s0; 820 821 if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 822 return false; 823 } 824 825 mat[1][1] = sum; 826 diag[1] = sum; 827 invDiagPtr[1] = d = 1.0f / sum; 828 829 if ( n <= 2 ) { 830 return true; 831 } 832 833 mptr = mat[0]; 834 for ( int j = 2; j < n; j++ ) { 835 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; 836 } 837 838 mptr = mat[2]; 839 840 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; 841 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; 842 sum = mptr[2] - s0 - s1; 843 844 if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 845 return false; 846 } 847 848 mat[2][2] = sum; 849 diag[2] = sum; 850 invDiagPtr[2] = d = 1.0f / sum; 851 852 if ( n <= 3 ) { 853 return true; 854 } 855 856 mptr = mat[0]; 857 for ( int j = 3; j < n; j++ ) { 858 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; 859 } 860 861 mptr = mat[3]; 862 863 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; 864 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; 865 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; 866 sum = mptr[3] - s0 - s1 - s2; 867 868 if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 869 return false; 870 } 871 872 mat[3][3] = sum; 873 diag[3] = sum; 874 invDiagPtr[3] = d = 1.0f / sum; 875 876 if ( n <= 4 ) { 877 return true; 878 } 879 880 mptr = mat[0]; 881 for ( int j = 4; j < n; j++ ) { 882 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; 883 } 884 885 #ifdef ID_WIN_X86_SSE2_INTRIN 886 887 __m128 vzero = _mm_setzero_ps(); 888 for ( int i = 4; i < n; i += 4 ) { 889 _mm_store_ps( diag + i, vzero ); 890 } 891 892 for ( int i = 4; i < n; i++ ) { 893 mptr = mat[i]; 894 895 assert_16_byte_aligned( v ); 896 assert_16_byte_aligned( mptr ); 897 assert_16_byte_aligned( diag ); 898 899 __m128 m0 = _mm_load_ps( mptr + 0 ); 900 __m128 d0 = _mm_load_ps( diag + 0 ); 901 __m128 v0 = _mm_mul_ps( d0, m0 ); 902 __m128 t0 = _mm_load_ss( mptr + i ); 903 t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); 904 _mm_store_ps( v + 0, v0 ); 905 906 int k = 4; 907 for ( ; k < i - 3; k += 4 ) { 908 m0 = _mm_load_ps( mptr + k ); 909 d0 = _mm_load_ps( diag + k ); 910 v0 = _mm_mul_ps( d0, m0 ); 911 t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); 912 _mm_store_ps( v + k, v0 ); 913 } 914 915 __m128 mask = (__m128 &) SIMD_SP_indexedEndMask[i & 3]; 916 917 m0 = _mm_and_ps( _mm_load_ps( mptr + k ), mask ); 918 d0 = _mm_load_ps( diag + k ); 919 v0 = _mm_mul_ps( d0, m0 ); 920 t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); 921 _mm_store_ps( v + k, v0 ); 922 923 t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 924 t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 925 926 __m128 tiny = _mm_and_ps( _mm_cmpeq_ps( t0, SIMD_SP_zero ), SIMD_SP_tiny ); 927 t0 = _mm_or_ps( t0, tiny ); 928 929 _mm_store_ss( mptr + i, t0 ); 930 _mm_store_ss( diag + i, t0 ); 931 932 __m128 d = _mm_rcp32_ps( t0 ); 933 _mm_store_ss( invDiagPtr + i, d ); 934 935 if ( i + 1 >= n ) { 936 return true; 937 } 938 939 int j = i + 1; 940 for ( ; j < n - 3; j += 4 ) { 941 float * ra = mat[j+0]; 942 float * rb = mat[j+1]; 943 float * rc = mat[j+2]; 944 float * rd = mat[j+3]; 945 946 assert_16_byte_aligned( v ); 947 assert_16_byte_aligned( ra ); 948 assert_16_byte_aligned( rb ); 949 assert_16_byte_aligned( rc ); 950 assert_16_byte_aligned( rd ); 951 952 __m128 va = _mm_load_ss( ra + i ); 953 __m128 vb = _mm_load_ss( rb + i ); 954 __m128 vc = _mm_load_ss( rc + i ); 955 __m128 vd = _mm_load_ss( rd + i ); 956 957 __m128 v0 = _mm_load_ps( v + 0 ); 958 959 va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + 0 ), v0 ) ); 960 vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + 0 ), v0 ) ); 961 vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + 0 ), v0 ) ); 962 vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + 0 ), v0 ) ); 963 964 int k = 4; 965 for ( ; k < i - 3; k += 4 ) { 966 v0 = _mm_load_ps( v + k ); 967 968 va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + k ), v0 ) ); 969 vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + k ), v0 ) ); 970 vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + k ), v0 ) ); 971 vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + k ), v0 ) ); 972 } 973 974 v0 = _mm_load_ps( v + k ); 975 976 va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( ra + k ), mask ), v0 ) ); 977 vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rb + k ), mask ), v0 ) ); 978 vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rc + k ), mask ), v0 ) ); 979 vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rd + k ), mask ), v0 ) ); 980 981 __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1 982 __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3 983 __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1 984 __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3 985 986 va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0 987 vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1 988 vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2 989 vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3 990 991 va = _mm_add_ps( va, vb ); 992 vc = _mm_add_ps( vc, vd ); 993 va = _mm_add_ps( va, vc ); 994 va = _mm_mul_ps( va, d ); 995 996 _mm_store_ss( ra + i, _mm_splat_ps( va, 0 ) ); 997 _mm_store_ss( rb + i, _mm_splat_ps( va, 1 ) ); 998 _mm_store_ss( rc + i, _mm_splat_ps( va, 2 ) ); 999 _mm_store_ss( rd + i, _mm_splat_ps( va, 3 ) ); 1000 } 1001 for ( ; j < n; j++ ) { 1002 float * mptr = mat[j]; 1003 1004 assert_16_byte_aligned( v ); 1005 assert_16_byte_aligned( mptr ); 1006 1007 __m128 va = _mm_load_ss( mptr + i ); 1008 __m128 v0 = _mm_load_ps( v + 0 ); 1009 1010 va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + 0 ), v0 ) ); 1011 1012 int k = 4; 1013 for ( ; k < i - 3; k += 4 ) { 1014 v0 = _mm_load_ps( v + k ); 1015 va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + k ), v0 ) ); 1016 } 1017 1018 v0 = _mm_load_ps( v + k ); 1019 va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( mptr + k ), mask ), v0 ) ); 1020 1021 va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 1022 va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 1023 va = _mm_mul_ps( va, d ); 1024 1025 _mm_store_ss( mptr + i, va ); 1026 } 1027 } 1028 return true; 1029 1030 #else 1031 1032 for ( int i = 4; i < n; i += 4 ) { 1033 diag[i+0] = 0.0f; 1034 diag[i+1] = 0.0f; 1035 diag[i+2] = 0.0f; 1036 diag[i+3] = 0.0f; 1037 } 1038 1039 for ( int i = 4; i < n; i++ ) { 1040 mptr = mat[i]; 1041 1042 assert_16_byte_aligned( v ); 1043 assert_16_byte_aligned( mptr ); 1044 assert_16_byte_aligned( diag ); 1045 1046 v[0] = diag[0] * mptr[0]; 1047 v[1] = diag[1] * mptr[1]; 1048 v[2] = diag[2] * mptr[2]; 1049 v[3] = diag[3] * mptr[3]; 1050 1051 float t0 = - mptr[0] * v[0] + mptr[i]; 1052 float t1 = - mptr[1] * v[1]; 1053 float t2 = - mptr[2] * v[2]; 1054 float t3 = - mptr[3] * v[3]; 1055 1056 int k = 4; 1057 for ( ; k < i - 3; k += 4 ) { 1058 v[k+0] = diag[k+0] * mptr[k+0]; 1059 v[k+1] = diag[k+1] * mptr[k+1]; 1060 v[k+2] = diag[k+2] * mptr[k+2]; 1061 v[k+3] = diag[k+3] * mptr[k+3]; 1062 1063 t0 -= mptr[k+0] * v[k+0]; 1064 t1 -= mptr[k+1] * v[k+1]; 1065 t2 -= mptr[k+2] * v[k+2]; 1066 t3 -= mptr[k+3] * v[k+3]; 1067 } 1068 1069 float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f; 1070 float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f; 1071 float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f; 1072 float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f; 1073 1074 v[k+0] = diag[k+0] * m0; 1075 v[k+1] = diag[k+1] * m1; 1076 v[k+2] = diag[k+2] * m2; 1077 v[k+3] = diag[k+3] * m3; 1078 1079 t0 -= m0 * v[k+0]; 1080 t1 -= m1 * v[k+1]; 1081 t2 -= m2 * v[k+2]; 1082 t3 -= m3 * v[k+3]; 1083 1084 sum = t0 + t1 + t2 + t3; 1085 1086 if ( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 1087 return false; 1088 } 1089 1090 mat[i][i] = sum; 1091 diag[i] = sum; 1092 invDiagPtr[i] = d = 1.0f / sum; 1093 1094 if ( i + 1 >= n ) { 1095 return true; 1096 } 1097 1098 int j = i + 1; 1099 for ( ; j < n - 3; j += 4 ) { 1100 float * ra = mat[j+0]; 1101 float * rb = mat[j+1]; 1102 float * rc = mat[j+2]; 1103 float * rd = mat[j+3]; 1104 1105 assert_16_byte_aligned( v ); 1106 assert_16_byte_aligned( ra ); 1107 assert_16_byte_aligned( rb ); 1108 assert_16_byte_aligned( rc ); 1109 assert_16_byte_aligned( rd ); 1110 1111 float a0 = - ra[0] * v[0] + ra[i]; 1112 float a1 = - ra[1] * v[1]; 1113 float a2 = - ra[2] * v[2]; 1114 float a3 = - ra[3] * v[3]; 1115 1116 float b0 = - rb[0] * v[0] + rb[i]; 1117 float b1 = - rb[1] * v[1]; 1118 float b2 = - rb[2] * v[2]; 1119 float b3 = - rb[3] * v[3]; 1120 1121 float c0 = - rc[0] * v[0] + rc[i]; 1122 float c1 = - rc[1] * v[1]; 1123 float c2 = - rc[2] * v[2]; 1124 float c3 = - rc[3] * v[3]; 1125 1126 float d0 = - rd[0] * v[0] + rd[i]; 1127 float d1 = - rd[1] * v[1]; 1128 float d2 = - rd[2] * v[2]; 1129 float d3 = - rd[3] * v[3]; 1130 1131 int k = 4; 1132 for ( ; k < i - 3; k += 4 ) { 1133 a0 -= ra[k+0] * v[k+0]; 1134 a1 -= ra[k+1] * v[k+1]; 1135 a2 -= ra[k+2] * v[k+2]; 1136 a3 -= ra[k+3] * v[k+3]; 1137 1138 b0 -= rb[k+0] * v[k+0]; 1139 b1 -= rb[k+1] * v[k+1]; 1140 b2 -= rb[k+2] * v[k+2]; 1141 b3 -= rb[k+3] * v[k+3]; 1142 1143 c0 -= rc[k+0] * v[k+0]; 1144 c1 -= rc[k+1] * v[k+1]; 1145 c2 -= rc[k+2] * v[k+2]; 1146 c3 -= rc[k+3] * v[k+3]; 1147 1148 d0 -= rd[k+0] * v[k+0]; 1149 d1 -= rd[k+1] * v[k+1]; 1150 d2 -= rd[k+2] * v[k+2]; 1151 d3 -= rd[k+3] * v[k+3]; 1152 } 1153 1154 float ra0 = ( i - k > 0 ) ? ra[k+0] : 0.0f; 1155 float ra1 = ( i - k > 1 ) ? ra[k+1] : 0.0f; 1156 float ra2 = ( i - k > 2 ) ? ra[k+2] : 0.0f; 1157 float ra3 = ( i - k > 3 ) ? ra[k+3] : 0.0f; 1158 1159 float rb0 = ( i - k > 0 ) ? rb[k+0] : 0.0f; 1160 float rb1 = ( i - k > 1 ) ? rb[k+1] : 0.0f; 1161 float rb2 = ( i - k > 2 ) ? rb[k+2] : 0.0f; 1162 float rb3 = ( i - k > 3 ) ? rb[k+3] : 0.0f; 1163 1164 float rc0 = ( i - k > 0 ) ? rc[k+0] : 0.0f; 1165 float rc1 = ( i - k > 1 ) ? rc[k+1] : 0.0f; 1166 float rc2 = ( i - k > 2 ) ? rc[k+2] : 0.0f; 1167 float rc3 = ( i - k > 3 ) ? rc[k+3] : 0.0f; 1168 1169 float rd0 = ( i - k > 0 ) ? rd[k+0] : 0.0f; 1170 float rd1 = ( i - k > 1 ) ? rd[k+1] : 0.0f; 1171 float rd2 = ( i - k > 2 ) ? rd[k+2] : 0.0f; 1172 float rd3 = ( i - k > 3 ) ? rd[k+3] : 0.0f; 1173 1174 a0 -= ra0 * v[k+0]; 1175 a1 -= ra1 * v[k+1]; 1176 a2 -= ra2 * v[k+2]; 1177 a3 -= ra3 * v[k+3]; 1178 1179 b0 -= rb0 * v[k+0]; 1180 b1 -= rb1 * v[k+1]; 1181 b2 -= rb2 * v[k+2]; 1182 b3 -= rb3 * v[k+3]; 1183 1184 c0 -= rc0 * v[k+0]; 1185 c1 -= rc1 * v[k+1]; 1186 c2 -= rc2 * v[k+2]; 1187 c3 -= rc3 * v[k+3]; 1188 1189 d0 -= rd0 * v[k+0]; 1190 d1 -= rd1 * v[k+1]; 1191 d2 -= rd2 * v[k+2]; 1192 d3 -= rd3 * v[k+3]; 1193 1194 ra[i] = ( a0 + a1 + a2 + a3 ) * d; 1195 rb[i] = ( b0 + b1 + b2 + b3 ) * d; 1196 rc[i] = ( c0 + c1 + c2 + c3 ) * d; 1197 rd[i] = ( d0 + d1 + d2 + d3 ) * d; 1198 } 1199 for ( ; j < n; j++ ) { 1200 mptr = mat[j]; 1201 1202 assert_16_byte_aligned( v ); 1203 assert_16_byte_aligned( mptr ); 1204 1205 float a0 = - mptr[0] * v[0] + mptr[i]; 1206 float a1 = - mptr[1] * v[1]; 1207 float a2 = - mptr[2] * v[2]; 1208 float a3 = - mptr[3] * v[3]; 1209 1210 int k = 4; 1211 for ( ; k < i - 3; k += 4 ) { 1212 a0 -= mptr[k+0] * v[k+0]; 1213 a1 -= mptr[k+1] * v[k+1]; 1214 a2 -= mptr[k+2] * v[k+2]; 1215 a3 -= mptr[k+3] * v[k+3]; 1216 } 1217 1218 float m0 = ( i - k > 0 ) ? mptr[k+0] : 0.0f; 1219 float m1 = ( i - k > 1 ) ? mptr[k+1] : 0.0f; 1220 float m2 = ( i - k > 2 ) ? mptr[k+2] : 0.0f; 1221 float m3 = ( i - k > 3 ) ? mptr[k+3] : 0.0f; 1222 1223 a0 -= m0 * v[k+0]; 1224 a1 -= m1 * v[k+1]; 1225 a2 -= m2 * v[k+2]; 1226 a3 -= m3 * v[k+3]; 1227 1228 mptr[i] = ( a0 + a1 + a2 + a3 ) * d; 1229 } 1230 } 1231 return true; 1232 1233 #endif 1234 } 1235 1236 /* 1237 ======================== 1238 GetMaxStep_SIMD 1239 ======================== 1240 */ 1241 static void GetMaxStep_SIMD( const float * f, const float * a, const float * delta_f, const float * delta_a, 1242 const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped, 1243 int d, float dir, float & maxStep, int & limit, int & limitSide ) { 1244 1245 #ifdef ID_WIN_X86_SSE2_INTRIN 1246 1247 __m128 vMaxStep; 1248 __m128i vLimit; 1249 __m128i vLimitSide; 1250 1251 // default to a full step for the current variable 1252 { 1253 __m128 vNegAccel = _mm_xor_ps( _mm_load1_ps( a + d ), (__m128 &) SIMD_SP_signBit ); 1254 __m128 vDeltaAccel = _mm_load1_ps( delta_a + d ); 1255 __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaAccel, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); 1256 __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); 1257 vMaxStep = _mm_sel_ps( SIMD_SP_zero, vStep, vM0 ); 1258 vLimit = _mm_shuffle_epi32( _mm_cvtsi32_si128( d ), 0 ); 1259 vLimitSide = (__m128i &) SIMD_DW_zero; 1260 } 1261 1262 // test the current variable 1263 { 1264 __m128 vDeltaForce = _mm_load1_ps( & dir ); 1265 __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); 1266 __m128 vForceLimit = _mm_sel_ps( _mm_load1_ps( hi + d ), _mm_load1_ps( lo + d ), vSign ); 1267 __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load1_ps( f + d ) ), vDeltaForce ); 1268 __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one ); 1269 __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); 1270 __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity ); 1271 __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); 1272 __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); 1273 vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); 1274 vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); 1275 } 1276 1277 // test the clamped bounded variables 1278 { 1279 __m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numUnbounded & 3]; 1280 __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numUnbounded ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index ); 1281 int i = numUnbounded & ~3; 1282 for ( ; i < numClamped - 3; i += 4 ) { 1283 __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), mask ); 1284 __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); 1285 __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign ); 1286 __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); 1287 __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) ); 1288 __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one ); 1289 __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity ); 1290 __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); 1291 __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); 1292 vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); 1293 vLimit = _mm_sel_si128( vLimit, index, vM3 ); 1294 vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); 1295 mask = (__m128 &) SIMD_SP_indexedStartMask[0]; 1296 index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four ); 1297 } 1298 __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[numClamped & 3] ) ); 1299 __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); 1300 __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign ); 1301 __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, (__m128 &) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); 1302 __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) ); 1303 __m128i vSetSide = _mm_or_si128( __m128c( vSign ), (__m128i &) SIMD_DW_one ); 1304 __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, (__m128 &) SIMD_SP_absMask ), SIMD_SP_infinity ); 1305 __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); 1306 __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); 1307 vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); 1308 vLimit = _mm_sel_si128( vLimit, index, vM3 ); 1309 vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); 1310 } 1311 1312 // test the not clamped bounded variables 1313 { 1314 __m128 mask = (__m128 &) SIMD_SP_indexedStartMask[numClamped & 3]; 1315 __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numClamped ), 0 ), (__m128i &) SIMD_DW_not3 ), (__m128i &) SIMD_DW_index ); 1316 int i = numClamped & ~3; 1317 for ( ; i < d - 3; i += 4 ) { 1318 __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit ); 1319 __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), mask ); 1320 __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) ); 1321 __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); 1322 __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); 1323 __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) ); 1324 __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); 1325 __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); 1326 vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); 1327 vLimit = _mm_sel_si128( vLimit, index, vM3 ); 1328 vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) ); 1329 mask = (__m128 &) SIMD_SP_indexedStartMask[0]; 1330 index = _mm_add_epi32( index, (__m128i &) SIMD_DW_four ); 1331 } 1332 __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), (__m128 &) SIMD_SP_signBit ); 1333 __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), _mm_and_ps( mask, (__m128 &) SIMD_SP_indexedEndMask[d & 3] ) ); 1334 __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( (__m128i *) ( side + i ) ) ); 1335 __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); 1336 __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); 1337 __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) ); 1338 __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); 1339 __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); 1340 vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); 1341 vLimit = _mm_sel_si128( vLimit, index, vM3 ); 1342 vLimitSide = _mm_sel_si128( vLimitSide, (__m128i &) SIMD_DW_zero, __m128c( vM3 ) ); 1343 } 1344 1345 { 1346 __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1347 __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1348 __m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1349 __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep ); 1350 vMaxStep = _mm_min_ps( vMaxStep, tMaxStep ); 1351 vLimit = _mm_sel_si128( vLimit, tLimit, mask ); 1352 vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask ); 1353 } 1354 1355 { 1356 __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 2, 3, 0, 1 ) ); 1357 __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 2, 3, 0, 1 ) ); 1358 __m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 2, 3, 0, 1 ) ); 1359 __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep ); 1360 vMaxStep = _mm_min_ps( vMaxStep, tMaxStep ); 1361 vLimit = _mm_sel_si128( vLimit, tLimit, mask ); 1362 vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask ); 1363 } 1364 1365 _mm_store_ss( & maxStep, vMaxStep ); 1366 limit = _mm_cvtsi128_si32( vLimit ); 1367 limitSide = _mm_cvtsi128_si32( vLimitSide ); 1368 1369 #else 1370 1371 // default to a full step for the current variable 1372 { 1373 float negAccel = -a[d]; 1374 float deltaAccel = delta_a[d]; 1375 int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON ); 1376 float step = negAccel / ( m0 ? deltaAccel : 1.0f ); 1377 maxStep = m0 ? step : 0.0f; 1378 limit = d; 1379 limitSide = 0; 1380 } 1381 1382 // test the current variable 1383 { 1384 float deltaForce = dir; 1385 float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d]; 1386 float step = ( forceLimit - f[d] ) / deltaForce; 1387 int setSide = ( deltaForce < 0.0f ) ? -1 : 1; 1388 int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); 1389 int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); 1390 int m2 = ( step < maxStep ); 1391 int m3 = ( m0 & m1 & m2 ); 1392 maxStep = m3 ? step : maxStep; 1393 limit = m3 ? d : limit; 1394 limitSide = m3 ? setSide : limitSide; 1395 } 1396 1397 // test the clamped bounded variables 1398 for ( int i = numUnbounded; i < numClamped; i++ ) { 1399 float deltaForce = delta_f[i]; 1400 float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i]; 1401 int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); 1402 float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f ); 1403 int setSide = ( deltaForce < 0.0f ) ? -1 : 1; 1404 int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); 1405 int m2 = ( step < maxStep ); 1406 int m3 = ( m0 & m1 & m2 ); 1407 maxStep = m3 ? step : maxStep; 1408 limit = m3 ? i : limit; 1409 limitSide = m3 ? setSide : limitSide; 1410 } 1411 1412 // test the not clamped bounded variables 1413 for ( int i = numClamped; i < d; i++ ) { 1414 float negAccel = -a[i]; 1415 float deltaAccel = delta_a[i]; 1416 int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON ); 1417 float step = negAccel / ( m0 ? deltaAccel : 1.0f ); 1418 int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON ); 1419 int m2 = ( step < maxStep ); 1420 int m3 = ( m0 & m1 & m2 ); 1421 maxStep = m3 ? step : maxStep; 1422 limit = m3 ? i : limit; 1423 limitSide = m3 ? 0 : limitSide; 1424 } 1425 1426 #endif 1427 } 1428 1429 /* 1430 ================================================================================================ 1431 1432 SIMD test code 1433 1434 ================================================================================================ 1435 */ 1436 1437 //#define ENABLE_TEST_CODE 1438 1439 #ifdef ENABLE_TEST_CODE 1440 1441 #define TEST_TRIANGULAR_SOLVE_SIMD_EPSILON 0.1f 1442 #define TEST_TRIANGULAR_SOLVE_SIZE 50 1443 #define TEST_FACTOR_SIMD_EPSILON 0.1f 1444 #define TEST_FACTOR_SOLVE_SIZE 50 1445 #define NUM_TESTS 50 1446 1447 /* 1448 ======================== 1449 PrintClocks 1450 ======================== 1451 */ 1452 static void PrintClocks( const char * string, int dataCount, int64 clocks, int64 otherClocks = 0 ) { 1453 idLib::Printf( string ); 1454 for ( int i = idStr::LengthWithoutColors(string); i < 48; i++ ) { 1455 idLib::Printf(" "); 1456 } 1457 if ( clocks && otherClocks ) { 1458 int p = 0; 1459 if ( clocks <= otherClocks ) { 1460 p = idMath::Ftoi( (float) ( otherClocks - clocks ) * 100.0f / (float) otherClocks ); 1461 } else { 1462 p = - idMath::Ftoi( (float) ( clocks - otherClocks ) * 100.0f / (float) clocks ); 1463 } 1464 idLib::Printf( "c = %4d, clcks = %5lld, %d%%\n", dataCount, clocks, p ); 1465 } else { 1466 idLib::Printf( "c = %4d, clcks = %5lld\n", dataCount, clocks ); 1467 } 1468 } 1469 1470 /* 1471 ======================== 1472 DotProduct_Test 1473 ======================== 1474 */ 1475 static void DotProduct_Test() { 1476 ALIGN16( float fsrc0[TEST_TRIANGULAR_SOLVE_SIZE + 1]; ) 1477 ALIGN16( float fsrc1[TEST_TRIANGULAR_SOLVE_SIZE + 1]; ) 1478 1479 idRandom srnd( 13 ); 1480 1481 for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { 1482 fsrc0[i] = srnd.CRandomFloat() * 10.0f; 1483 fsrc1[i] = srnd.CRandomFloat() * 10.0f; 1484 } 1485 1486 idTimer timer; 1487 1488 for ( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { 1489 1490 float dot1 = DotProduct_Generic( fsrc0, fsrc1, i ); 1491 int64 clocksGeneric = 0xFFFFFFFFFFFF; 1492 for ( int j = 0; j < NUM_TESTS; j++ ) { 1493 fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j; 1494 timer.Clear(); 1495 timer.Start(); 1496 dot1 = DotProduct_Generic( fsrc0, fsrc1, i ); 1497 timer.Stop(); 1498 clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); 1499 } 1500 1501 PrintClocks( va( "DotProduct_Generic %d", i ), 1, clocksGeneric ); 1502 1503 float dot2 = DotProduct_SIMD( fsrc0, fsrc1, i ); 1504 int64 clocksSIMD = 0xFFFFFFFFFFFF; 1505 for ( int j = 0; j < NUM_TESTS; j++ ) { 1506 fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j; 1507 timer.Clear(); 1508 timer.Start(); 1509 dot2 = DotProduct_SIMD( fsrc0, fsrc1, i ); 1510 timer.Stop(); 1511 clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); 1512 } 1513 1514 const char * result = idMath::Fabs( dot1 - dot2 ) < 1e-4f ? "ok" : S_COLOR_RED"X"; 1515 PrintClocks( va( "DotProduct_SIMD %d %s", i, result ), 1, clocksSIMD, clocksGeneric ); 1516 } 1517 } 1518 1519 /* 1520 ======================== 1521 LowerTriangularSolve_Test 1522 ======================== 1523 */ 1524 static void LowerTriangularSolve_Test() { 1525 idMatX L; 1526 idVecX x, b, tst; 1527 1528 int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3; 1529 1530 L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); 1531 x.SetSize( paddedSize ); 1532 b.Random( paddedSize, 0, -1.0f, 1.0f ); 1533 1534 idTimer timer; 1535 1536 const int skip = 0; 1537 1538 for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { 1539 1540 x.Zero( i ); 1541 1542 LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); 1543 int64 clocksGeneric = 0xFFFFFFFFFFFF; 1544 for ( int j = 0; j < NUM_TESTS; j++ ) { 1545 timer.Clear(); 1546 timer.Start(); 1547 LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); 1548 timer.Stop(); 1549 clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); 1550 } 1551 1552 tst = x; 1553 x.Zero(); 1554 1555 PrintClocks( va( "LowerTriangularSolve_Generic %dx%d", i, i ), 1, clocksGeneric ); 1556 1557 LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); 1558 int64 clocksSIMD = 0xFFFFFFFFFFFF; 1559 for ( int j = 0; j < NUM_TESTS; j++ ) { 1560 timer.Clear(); 1561 timer.Start(); 1562 LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); 1563 timer.Stop(); 1564 clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); 1565 } 1566 1567 const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X"; 1568 PrintClocks( va( "LowerTriangularSolve_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); 1569 } 1570 } 1571 1572 /* 1573 ======================== 1574 LowerTriangularSolveTranspose_Test 1575 ======================== 1576 */ 1577 static void LowerTriangularSolveTranspose_Test() { 1578 idMatX L; 1579 idVecX x, b, tst; 1580 1581 int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3; 1582 1583 L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); 1584 x.SetSize( paddedSize ); 1585 b.Random( paddedSize, 0, -1.0f, 1.0f ); 1586 1587 idTimer timer; 1588 1589 for ( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { 1590 1591 x.Zero( i ); 1592 1593 LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); 1594 int64 clocksGeneric = 0xFFFFFFFFFFFF; 1595 for ( int j = 0; j < NUM_TESTS; j++ ) { 1596 timer.Clear(); 1597 timer.Start(); 1598 LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); 1599 timer.Stop(); 1600 clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); 1601 } 1602 1603 tst = x; 1604 x.Zero(); 1605 1606 PrintClocks( va( "LowerTriangularSolveTranspose_Generic %dx%d", i, i ), 1, clocksGeneric ); 1607 1608 LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); 1609 int64 clocksSIMD = 0xFFFFFFFFFFFF; 1610 for ( int j = 0; j < NUM_TESTS; j++ ) { 1611 timer.Clear(); 1612 timer.Start(); 1613 LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); 1614 timer.Stop(); 1615 clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); 1616 } 1617 1618 const char * result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X"; 1619 PrintClocks( va( "LowerTriangularSolveTranspose_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); 1620 } 1621 } 1622 1623 /* 1624 ======================== 1625 LDLT_Factor_Test 1626 ======================== 1627 */ 1628 static void LDLT_Factor_Test() { 1629 idMatX src, original, mat1, mat2; 1630 idVecX invDiag1, invDiag2; 1631 1632 int paddedSize = ( TEST_FACTOR_SOLVE_SIZE + 3 ) & ~3; 1633 1634 original.SetSize( paddedSize, paddedSize ); 1635 src.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); 1636 src.TransposeMultiply( original, src ); 1637 1638 idTimer timer; 1639 1640 for ( int i = 1; i < TEST_FACTOR_SOLVE_SIZE; i++ ) { 1641 1642 int64 clocksGeneric = 0xFFFFFFFFFFFF; 1643 for ( int j = 0; j < NUM_TESTS; j++ ) { 1644 mat1 = original; 1645 invDiag1.Zero( TEST_FACTOR_SOLVE_SIZE ); 1646 timer.Clear(); 1647 timer.Start(); 1648 LDLT_Factor_Generic( mat1, invDiag1, i ); 1649 timer.Stop(); 1650 clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); 1651 } 1652 1653 PrintClocks( va( "LDLT_Factor_Generic %dx%d", i, i ), 1, clocksGeneric ); 1654 1655 int64 clocksSIMD = 0xFFFFFFFFFFFF; 1656 for ( int j = 0; j < NUM_TESTS; j++ ) { 1657 mat2 = original; 1658 invDiag2.Zero( TEST_FACTOR_SOLVE_SIZE ); 1659 timer.Clear(); 1660 timer.Start(); 1661 LDLT_Factor_SIMD( mat2, invDiag2, i ); 1662 timer.Stop(); 1663 clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); 1664 } 1665 1666 const char * result = mat1.Compare( mat2, TEST_FACTOR_SIMD_EPSILON ) && invDiag1.Compare( invDiag2, TEST_FACTOR_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X"; 1667 PrintClocks( va( "LDLT_Factor_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); 1668 } 1669 } 1670 #endif 1671 1672 #define Multiply Multiply_SIMD 1673 #define MultiplyAdd MultiplyAdd_SIMD 1674 #define BigDotProduct DotProduct_SIMD 1675 #define LowerTriangularSolve LowerTriangularSolve_SIMD 1676 #define LowerTriangularSolveTranspose LowerTriangularSolveTranspose_SIMD 1677 #define UpperTriangularSolve UpperTriangularSolve_SIMD 1678 #define LU_Factor LU_Factor_SIMD 1679 #define LDLT_Factor LDLT_Factor_SIMD 1680 #define GetMaxStep GetMaxStep_SIMD 1681 1682 /* 1683 ================================================================================================ 1684 1685 idLCP_Square 1686 1687 ================================================================================================ 1688 */ 1689 1690 /* 1691 ================================================ 1692 idLCP_Square 1693 ================================================ 1694 */ 1695 class idLCP_Square : public idLCP { 1696 public: 1697 virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ); 1698 1699 private: 1700 idMatX m; // original matrix 1701 idVecX b; // right hand side 1702 idVecX lo, hi; // low and high bounds 1703 idVecX f, a; // force and acceleration 1704 idVecX delta_f, delta_a; // delta force and delta acceleration 1705 idMatX clamped; // LU factored sub matrix for clamped variables 1706 idVecX diagonal; // reciprocal of diagonal of U of the LU factored sub matrix for clamped variables 1707 int numUnbounded; // number of unbounded variables 1708 int numClamped; // number of clamped variables 1709 float ** rowPtrs; // pointers to the rows of m 1710 int * boxIndex; // box index 1711 int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0 1712 int * permuted; // index to keep track of the permutation 1713 bool padded; // set to true if the rows of the initial matrix are 16 byte padded 1714 1715 bool FactorClamped(); 1716 void SolveClamped( idVecX & x, const float * b ); 1717 void Swap( int i, int j ); 1718 void AddClamped( int r ); 1719 void RemoveClamped( int r ); 1720 void CalcForceDelta( int d, float dir ); 1721 void CalcAccelDelta( int d ); 1722 void ChangeForce( int d, float step ); 1723 void ChangeAccel( int d, float step ); 1724 }; 1725 1726 /* 1727 ======================== 1728 idLCP_Square::FactorClamped 1729 ======================== 1730 */ 1731 bool idLCP_Square::FactorClamped() { 1732 for ( int i = 0; i < numClamped; i++ ) { 1733 memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) ); 1734 } 1735 return LU_Factor( clamped, diagonal, numClamped ); 1736 } 1737 1738 /* 1739 ======================== 1740 idLCP_Square::SolveClamped 1741 ======================== 1742 */ 1743 void idLCP_Square::SolveClamped( idVecX & x, const float * b ) { 1744 // solve L 1745 LowerTriangularSolve( clamped, x.ToFloatPtr(), b, numClamped, 0 ); 1746 1747 // solve U 1748 UpperTriangularSolve( clamped, diagonal.ToFloatPtr(), x.ToFloatPtr(), x.ToFloatPtr(), numClamped ); 1749 } 1750 1751 /* 1752 ======================== 1753 idLCP_Square::Swap 1754 ======================== 1755 */ 1756 void idLCP_Square::Swap( int i, int j ) { 1757 1758 if ( i == j ) { 1759 return; 1760 } 1761 1762 SwapValues( rowPtrs[i], rowPtrs[j] ); 1763 m.SwapColumns( i, j ); 1764 b.SwapElements( i, j ); 1765 lo.SwapElements( i, j ); 1766 hi.SwapElements( i, j ); 1767 a.SwapElements( i, j ); 1768 f.SwapElements( i, j ); 1769 if ( boxIndex != NULL ) { 1770 SwapValues( boxIndex[i], boxIndex[j] ); 1771 } 1772 SwapValues( side[i], side[j] ); 1773 SwapValues( permuted[i], permuted[j] ); 1774 } 1775 1776 /* 1777 ======================== 1778 idLCP_Square::AddClamped 1779 ======================== 1780 */ 1781 void idLCP_Square::AddClamped( int r ) { 1782 1783 assert( r >= numClamped ); 1784 1785 // add a row at the bottom and a column at the right of the factored 1786 // matrix for the clamped variables 1787 1788 Swap( numClamped, r ); 1789 1790 // add row to L 1791 for ( int i = 0; i < numClamped; i++ ) { 1792 float sum = rowPtrs[numClamped][i]; 1793 for ( int j = 0; j < i; j++ ) { 1794 sum -= clamped[numClamped][j] * clamped[j][i]; 1795 } 1796 clamped[numClamped][i] = sum * diagonal[i]; 1797 } 1798 1799 // add column to U 1800 for ( int i = 0; i <= numClamped; i++ ) { 1801 float sum = rowPtrs[i][numClamped]; 1802 for ( int j = 0; j < i; j++ ) { 1803 sum -= clamped[i][j] * clamped[j][numClamped]; 1804 } 1805 clamped[i][numClamped] = sum; 1806 } 1807 1808 diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped]; 1809 1810 numClamped++; 1811 } 1812 1813 /* 1814 ======================== 1815 idLCP_Square::RemoveClamped 1816 ======================== 1817 */ 1818 void idLCP_Square::RemoveClamped( int r ) { 1819 1820 if ( !verify( r < numClamped ) ) { 1821 // complete fail, most likely due to exceptional floating point values 1822 return; 1823 } 1824 1825 numClamped--; 1826 1827 // no need to swap and update the factored matrix when the last row and column are removed 1828 if ( r == numClamped ) { 1829 return; 1830 } 1831 1832 float * y0 = (float *) _alloca16( numClamped * sizeof( float ) ); 1833 float * z0 = (float *) _alloca16( numClamped * sizeof( float ) ); 1834 float * y1 = (float *) _alloca16( numClamped * sizeof( float ) ); 1835 float * z1 = (float *) _alloca16( numClamped * sizeof( float ) ); 1836 1837 // the row/column need to be subtracted from the factorization 1838 for ( int i = 0; i < numClamped; i++ ) { 1839 y0[i] = -rowPtrs[i][r]; 1840 } 1841 1842 memset( y1, 0, numClamped * sizeof( float ) ); 1843 y1[r] = 1.0f; 1844 1845 memset( z0, 0, numClamped * sizeof( float ) ); 1846 z0[r] = 1.0f; 1847 1848 for ( int i = 0; i < numClamped; i++ ) { 1849 z1[i] = -rowPtrs[r][i]; 1850 } 1851 1852 // swap the to be removed row/column with the last row/column 1853 Swap( r, numClamped ); 1854 1855 // the swapped last row/column need to be added to the factorization 1856 for ( int i = 0; i < numClamped; i++ ) { 1857 y0[i] += rowPtrs[i][r]; 1858 } 1859 1860 for ( int i = 0; i < numClamped; i++ ) { 1861 z1[i] += rowPtrs[r][i]; 1862 } 1863 z1[r] = 0.0f; 1864 1865 // update the beginning of the to be updated row and column 1866 for ( int i = 0; i < r; i++ ) { 1867 float p0 = y0[i]; 1868 float beta1 = z1[i] * diagonal[i]; 1869 1870 clamped[i][r] += p0; 1871 for ( int j = i+1; j < numClamped; j++ ) { 1872 z1[j] -= beta1 * clamped[i][j]; 1873 } 1874 for ( int j = i+1; j < numClamped; j++ ) { 1875 y0[j] -= p0 * clamped[j][i]; 1876 } 1877 clamped[r][i] += beta1; 1878 } 1879 1880 // update the lower right corner starting at r,r 1881 for ( int i = r; i < numClamped; i++ ) { 1882 float diag = clamped[i][i]; 1883 1884 float p0 = y0[i]; 1885 float p1 = z0[i]; 1886 diag += p0 * p1; 1887 1888 if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 1889 idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" ); 1890 diag = idMath::FLT_SMALLEST_NON_DENORMAL; 1891 } 1892 1893 float beta0 = p1 / diag; 1894 1895 float q0 = y1[i]; 1896 float q1 = z1[i]; 1897 diag += q0 * q1; 1898 1899 if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 1900 idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" ); 1901 diag = idMath::FLT_SMALLEST_NON_DENORMAL; 1902 } 1903 1904 float d = 1.0f / diag; 1905 float beta1 = q1 * d; 1906 1907 clamped[i][i] = diag; 1908 diagonal[i] = d; 1909 1910 for ( int j = i+1; j < numClamped; j++ ) { 1911 1912 d = clamped[i][j]; 1913 1914 d += p0 * z0[j]; 1915 z0[j] -= beta0 * d; 1916 1917 d += q0 * z1[j]; 1918 z1[j] -= beta1 * d; 1919 1920 clamped[i][j] = d; 1921 } 1922 1923 for ( int j = i+1; j < numClamped; j++ ) { 1924 1925 d = clamped[j][i]; 1926 1927 y0[j] -= p0 * d; 1928 d += beta0 * y0[j]; 1929 1930 y1[j] -= q0 * d; 1931 d += beta1 * y1[j]; 1932 1933 clamped[j][i] = d; 1934 } 1935 } 1936 return; 1937 } 1938 1939 /* 1940 ======================== 1941 idLCP_Square::CalcForceDelta 1942 1943 Modifies this->delta_f. 1944 ======================== 1945 */ 1946 void idLCP_Square::CalcForceDelta( int d, float dir ) { 1947 1948 delta_f[d] = dir; 1949 1950 if ( numClamped <= 0 ) { 1951 return; 1952 } 1953 1954 // get column d of matrix 1955 float * ptr = (float *) _alloca16( numClamped * sizeof( float ) ); 1956 for ( int i = 0; i < numClamped; i++ ) { 1957 ptr[i] = rowPtrs[i][d]; 1958 } 1959 1960 // solve force delta 1961 SolveClamped( delta_f, ptr ); 1962 1963 // flip force delta based on direction 1964 if ( dir > 0.0f ) { 1965 ptr = delta_f.ToFloatPtr(); 1966 for ( int i = 0; i < numClamped; i++ ) { 1967 ptr[i] = - ptr[i]; 1968 } 1969 } 1970 } 1971 1972 /* 1973 ======================== 1974 idLCP_Square::CalcAccelDelta 1975 1976 Modifies this->delta_a and uses this->delta_f. 1977 ======================== 1978 */ 1979 ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) { 1980 // only the not clamped variables, including the current variable, can have a change in acceleration 1981 for ( int j = numClamped; j <= d; j++ ) { 1982 // only the clamped variables and the current variable have a force delta unequal zero 1983 float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped ); 1984 delta_a[j] = dot + rowPtrs[j][d] * delta_f[d]; 1985 } 1986 } 1987 1988 /* 1989 ======================== 1990 idLCP_Square::ChangeForce 1991 1992 Modifies this->f and uses this->delta_f. 1993 ======================== 1994 */ 1995 ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) { 1996 // only the clamped variables and current variable have a force delta unequal zero 1997 MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped ); 1998 f[d] += step * delta_f[d]; 1999 } 2000 2001 /* 2002 ======================== 2003 idLCP_Square::ChangeAccel 2004 2005 Modifies this->a and uses this->delta_a. 2006 ======================== 2007 */ 2008 ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) { 2009 // only the not clamped variables, including the current variable, can have an acceleration unequal zero 2010 MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 ); 2011 } 2012 2013 /* 2014 ======================== 2015 idLCP_Square::Solve 2016 ======================== 2017 */ 2018 bool idLCP_Square::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) { 2019 2020 // true when the matrix rows are 16 byte padded 2021 padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns(); 2022 2023 assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() ); 2024 assert( o_x.GetSize() == o_m.GetNumRows() ); 2025 assert( o_b.GetSize() == o_m.GetNumRows() ); 2026 assert( o_lo.GetSize() == o_m.GetNumRows() ); 2027 assert( o_hi.GetSize() == o_m.GetNumRows() ); 2028 2029 // allocate memory for permuted input 2030 f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) ); 2031 a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); 2032 b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); 2033 lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) ); 2034 hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) ); 2035 if ( o_boxIndex != NULL ) { 2036 boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) ); 2037 memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) ); 2038 } else { 2039 boxIndex = NULL; 2040 } 2041 2042 // we override the const on o_m here but on exit the matrix is unchanged 2043 m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast<float *>(o_m[0]) ); 2044 f.Zero(); 2045 a.Zero(); 2046 b = o_b; 2047 lo = o_lo; 2048 hi = o_hi; 2049 2050 // pointers to the rows of m 2051 rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) ); 2052 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2053 rowPtrs[i] = m[i]; 2054 } 2055 2056 // tells if a variable is at the low boundary, high boundary or inbetween 2057 side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) ); 2058 2059 // index to keep track of the permutation 2060 permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) ); 2061 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2062 permuted[i] = i; 2063 } 2064 2065 // permute input so all unbounded variables come first 2066 numUnbounded = 0; 2067 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2068 if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) { 2069 if ( numUnbounded != i ) { 2070 Swap( numUnbounded, i ); 2071 } 2072 numUnbounded++; 2073 } 2074 } 2075 2076 // permute input so all variables using the boxIndex come last 2077 int boxStartIndex = m.GetNumRows(); 2078 if ( boxIndex ) { 2079 for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) { 2080 if ( boxIndex[i] >= 0 ) { 2081 boxStartIndex--; 2082 if ( boxStartIndex != i ) { 2083 Swap( boxStartIndex, i ); 2084 } 2085 } 2086 } 2087 } 2088 2089 // sub matrix for factorization 2090 clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) ); 2091 diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2092 2093 // all unbounded variables are clamped 2094 numClamped = numUnbounded; 2095 2096 // if there are unbounded variables 2097 if ( numUnbounded ) { 2098 2099 // factor and solve for unbounded variables 2100 if ( !FactorClamped() ) { 2101 idLib::Printf( "idLCP_Square::Solve: unbounded factorization failed\n" ); 2102 return false; 2103 } 2104 SolveClamped( f, b.ToFloatPtr() ); 2105 2106 // if there are no bounded variables we are done 2107 if ( numUnbounded == m.GetNumRows() ) { 2108 o_x = f; // the vector is not permuted 2109 return true; 2110 } 2111 } 2112 2113 int numIgnored = 0; 2114 2115 // allocate for delta force and delta acceleration 2116 delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2117 delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2118 2119 // solve for bounded variables 2120 idStr failed; 2121 for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) { 2122 2123 // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index 2124 if ( i == boxStartIndex ) { 2125 for ( int j = 0; j < boxStartIndex; j++ ) { 2126 o_x[permuted[j]] = f[j]; 2127 } 2128 for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) { 2129 float s = o_x[boxIndex[j]]; 2130 if ( lo[j] != -idMath::INFINITY ) { 2131 lo[j] = - idMath::Fabs( lo[j] * s ); 2132 } 2133 if ( hi[j] != idMath::INFINITY ) { 2134 hi[j] = idMath::Fabs( hi[j] * s ); 2135 } 2136 } 2137 } 2138 2139 // calculate acceleration for current variable 2140 float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i ); 2141 a[i] = dot - b[i]; 2142 2143 // if already at the low boundary 2144 if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) { 2145 side[i] = -1; 2146 continue; 2147 } 2148 2149 // if already at the high boundary 2150 if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) { 2151 side[i] = 1; 2152 continue; 2153 } 2154 2155 // if inside the clamped region 2156 if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) { 2157 side[i] = 0; 2158 AddClamped( i ); 2159 continue; 2160 } 2161 2162 // drive the current variable into a valid region 2163 int n = 0; 2164 for ( ; n < maxIterations; n++ ) { 2165 2166 // direction to move 2167 float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f; 2168 2169 // calculate force delta 2170 CalcForceDelta( i, dir ); 2171 2172 // calculate acceleration delta: delta_a = m * delta_f; 2173 CalcAccelDelta( i ); 2174 2175 float maxStep; 2176 int limit; 2177 int limitSide; 2178 2179 // maximum step we can take 2180 GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(), 2181 lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped, 2182 i, dir, maxStep, limit, limitSide ); 2183 2184 if ( maxStep <= 0.0f ) { 2185 #ifdef IGNORE_UNSATISFIABLE_VARIABLES 2186 // ignore the current variable completely 2187 lo[i] = hi[i] = 0.0f; 2188 f[i] = 0.0f; 2189 side[i] = -1; 2190 numIgnored++; 2191 #else 2192 failed.Format( "invalid step size %.4f", maxStep ); 2193 for ( int j = i; j < m.GetNumRows(); j++ ) { 2194 f[j] = 0.0f; 2195 } 2196 numIgnored = m.GetNumRows() - i; 2197 #endif 2198 break; 2199 } 2200 2201 // change force 2202 ChangeForce( i, maxStep ); 2203 2204 // change acceleration 2205 ChangeAccel( i, maxStep ); 2206 2207 // clamp/unclamp the variable that limited this step 2208 side[limit] = limitSide; 2209 if ( limitSide == 0 ) { 2210 a[limit] = 0.0f; 2211 AddClamped( limit ); 2212 } else if ( limitSide == -1 ) { 2213 f[limit] = lo[limit]; 2214 if ( limit != i ) { 2215 RemoveClamped( limit ); 2216 } 2217 } else if ( limitSide == 1 ) { 2218 f[limit] = hi[limit]; 2219 if ( limit != i ) { 2220 RemoveClamped( limit ); 2221 } 2222 } 2223 2224 // if the current variable limited the step we can continue with the next variable 2225 if ( limit == i ) { 2226 break; 2227 } 2228 } 2229 2230 if ( n >= maxIterations ) { 2231 failed.Format( "max iterations %d", maxIterations ); 2232 break; 2233 } 2234 2235 if ( failed.Length() ) { 2236 break; 2237 } 2238 } 2239 2240 #ifdef IGNORE_UNSATISFIABLE_VARIABLES 2241 if ( numIgnored ) { 2242 if ( lcp_showFailures.GetBool() ) { 2243 idLib::Printf( "idLCP_Square::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded ); 2244 } 2245 } 2246 #endif 2247 2248 // if failed clear remaining forces 2249 if ( failed.Length() ) { 2250 if ( lcp_showFailures.GetBool() ) { 2251 idLib::Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded ); 2252 } 2253 } 2254 2255 #if defined(_DEBUG) && 0 2256 if ( failed.Length() ) { 2257 // test whether or not the solution satisfies the complementarity conditions 2258 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2259 a[i] = -b[i]; 2260 for ( int j = 0; j < m.GetNumRows(); j++ ) { 2261 a[i] += rowPtrs[i][j] * f[j]; 2262 } 2263 2264 if ( f[i] == lo[i] ) { 2265 if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) { 2266 int bah1 = 1; 2267 } 2268 } else if ( f[i] == hi[i] ) { 2269 if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) { 2270 int bah2 = 1; 2271 } 2272 } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) { 2273 int bah3 = 1; 2274 } 2275 } 2276 } 2277 #endif 2278 2279 // unpermute result 2280 for ( int i = 0; i < f.GetSize(); i++ ) { 2281 o_x[permuted[i]] = f[i]; 2282 } 2283 2284 return true; 2285 } 2286 2287 /* 2288 ================================================================================================ 2289 2290 idLCP_Symmetric 2291 2292 ================================================================================================ 2293 */ 2294 2295 /* 2296 ================================================ 2297 idLCP_Symmetric 2298 ================================================ 2299 */ 2300 class idLCP_Symmetric : public idLCP { 2301 public: 2302 virtual bool Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ); 2303 2304 private: 2305 idMatX m; // original matrix 2306 idVecX b; // right hand side 2307 idVecX lo, hi; // low and high bounds 2308 idVecX f, a; // force and acceleration 2309 idVecX delta_f, delta_a; // delta force and delta acceleration 2310 idMatX clamped; // LDLt factored sub matrix for clamped variables 2311 idVecX diagonal; // reciprocal of diagonal of LDLt factored sub matrix for clamped variables 2312 idVecX solveCache1; // intermediate result cached in SolveClamped 2313 idVecX solveCache2; // " 2314 int numUnbounded; // number of unbounded variables 2315 int numClamped; // number of clamped variables 2316 int clampedChangeStart; // lowest row/column changed in the clamped matrix during an iteration 2317 float ** rowPtrs; // pointers to the rows of m 2318 int * boxIndex; // box index 2319 int * side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0 2320 int * permuted; // index to keep track of the permutation 2321 bool padded; // set to true if the rows of the initial matrix are 16 byte padded 2322 2323 bool FactorClamped(); 2324 void SolveClamped( idVecX &x, const float *b ); 2325 void Swap( int i, int j ); 2326 void AddClamped( int r, bool useSolveCache ); 2327 void RemoveClamped( int r ); 2328 void CalcForceDelta( int d, float dir ); 2329 void CalcAccelDelta( int d ); 2330 void ChangeForce( int d, float step ); 2331 void ChangeAccel( int d, float step ); 2332 }; 2333 2334 /* 2335 ======================== 2336 idLCP_Symmetric::FactorClamped 2337 ======================== 2338 */ 2339 bool idLCP_Symmetric::FactorClamped() { 2340 2341 clampedChangeStart = 0; 2342 2343 for ( int i = 0; i < numClamped; i++ ) { 2344 memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) ); 2345 } 2346 return LDLT_Factor( clamped, diagonal, numClamped ); 2347 } 2348 2349 /* 2350 ======================== 2351 idLCP_Symmetric::SolveClamped 2352 ======================== 2353 */ 2354 void idLCP_Symmetric::SolveClamped( idVecX &x, const float *b ) { 2355 2356 // solve L 2357 LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart ); 2358 2359 // scale with D 2360 Multiply( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped ); 2361 2362 // solve Lt 2363 LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped ); 2364 2365 clampedChangeStart = numClamped; 2366 } 2367 2368 /* 2369 ======================== 2370 idLCP_Symmetric::Swap 2371 ======================== 2372 */ 2373 void idLCP_Symmetric::Swap( int i, int j ) { 2374 2375 if ( i == j ) { 2376 return; 2377 } 2378 2379 SwapValues( rowPtrs[i], rowPtrs[j] ); 2380 m.SwapColumns( i, j ); 2381 b.SwapElements( i, j ); 2382 lo.SwapElements( i, j ); 2383 hi.SwapElements( i, j ); 2384 a.SwapElements( i, j ); 2385 f.SwapElements( i, j ); 2386 if ( boxIndex != NULL ) { 2387 SwapValues( boxIndex[i], boxIndex[j] ); 2388 } 2389 SwapValues( side[i], side[j] ); 2390 SwapValues( permuted[i], permuted[j] ); 2391 } 2392 2393 /* 2394 ======================== 2395 idLCP_Symmetric::AddClamped 2396 ======================== 2397 */ 2398 void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) { 2399 2400 assert( r >= numClamped ); 2401 2402 if ( numClamped < clampedChangeStart ) { 2403 clampedChangeStart = numClamped; 2404 } 2405 2406 // add a row at the bottom and a column at the right of the factored 2407 // matrix for the clamped variables 2408 2409 Swap( numClamped, r ); 2410 2411 // solve for v in L * v = rowPtr[numClamped] 2412 float dot; 2413 if ( useSolveCache ) { 2414 2415 // the lower triangular solve was cached in SolveClamped called by CalcForceDelta 2416 memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) ); 2417 // calculate row dot product 2418 dot = BigDotProduct( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped ); 2419 2420 } else { 2421 2422 float *v = (float *) _alloca16( numClamped * sizeof( float ) ); 2423 2424 LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped, 0 ); 2425 // add bottom row to L 2426 Multiply( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped ); 2427 // calculate row dot product 2428 dot = BigDotProduct( clamped[numClamped], v, numClamped ); 2429 } 2430 2431 // update diagonal[numClamped] 2432 float d = rowPtrs[numClamped][numClamped] - dot; 2433 2434 if ( fabs( d ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 2435 idLib::Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" ); 2436 d = idMath::FLT_SMALLEST_NON_DENORMAL; 2437 } 2438 2439 clamped[numClamped][numClamped] = d; 2440 diagonal[numClamped] = 1.0f / d; 2441 2442 numClamped++; 2443 } 2444 2445 /* 2446 ======================== 2447 idLCP_Symmetric::RemoveClamped 2448 ======================== 2449 */ 2450 void idLCP_Symmetric::RemoveClamped( int r ) { 2451 2452 if ( !verify( r < numClamped ) ) { 2453 // complete fail, most likely due to exceptional floating point values 2454 return; 2455 } 2456 2457 if ( r < clampedChangeStart ) { 2458 clampedChangeStart = r; 2459 } 2460 2461 numClamped--; 2462 2463 // no need to swap and update the factored matrix when the last row and column are removed 2464 if ( r == numClamped ) { 2465 return; 2466 } 2467 2468 // swap the to be removed row/column with the last row/column 2469 Swap( r, numClamped ); 2470 2471 // update the factored matrix 2472 float * addSub = (float *) _alloca16( numClamped * sizeof( float ) ); 2473 2474 if ( r == 0 ) { 2475 2476 if ( numClamped == 1 ) { 2477 float diag = rowPtrs[0][0]; 2478 if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 2479 idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); 2480 diag = idMath::FLT_SMALLEST_NON_DENORMAL; 2481 } 2482 clamped[0][0] = diag; 2483 diagonal[0] = 1.0f / diag; 2484 return; 2485 } 2486 2487 // calculate the row/column to be added to the lower right sub matrix starting at (r, r) 2488 float * original = rowPtrs[numClamped]; 2489 float * ptr = rowPtrs[r]; 2490 addSub[0] = ptr[0] - original[numClamped]; 2491 for ( int i = 1; i < numClamped; i++ ) { 2492 addSub[i] = ptr[i] - original[i]; 2493 } 2494 2495 } else { 2496 2497 float * v = (float *) _alloca16( numClamped * sizeof( float ) ); 2498 2499 // solve for v in L * v = rowPtr[r] 2500 LowerTriangularSolve( clamped, v, rowPtrs[r], r, 0 ); 2501 2502 // update removed row 2503 Multiply( clamped[r], v, diagonal.ToFloatPtr(), r ); 2504 2505 // if the last row/column of the matrix is updated 2506 if ( r == numClamped - 1 ) { 2507 // only calculate new diagonal 2508 float dot = BigDotProduct( clamped[r], v, r ); 2509 float diag = rowPtrs[r][r] - dot; 2510 if ( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 2511 idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); 2512 diag = idMath::FLT_SMALLEST_NON_DENORMAL; 2513 } 2514 clamped[r][r] = diag; 2515 diagonal[r] = 1.0f / diag; 2516 return; 2517 } 2518 2519 // calculate the row/column to be added to the lower right sub matrix starting at (r, r) 2520 for ( int i = 0; i < r; i++ ) { 2521 v[i] = clamped[r][i] * clamped[i][i]; 2522 } 2523 for ( int i = r; i < numClamped; i++ ) { 2524 float sum; 2525 if ( i == r ) { 2526 sum = clamped[r][r]; 2527 } else { 2528 sum = clamped[r][r] * clamped[i][r]; 2529 } 2530 float * ptr = clamped[i]; 2531 for ( int j = 0; j < r; j++ ) { 2532 sum += ptr[j] * v[j]; 2533 } 2534 addSub[i] = rowPtrs[r][i] - sum; 2535 } 2536 } 2537 2538 // add row/column to the lower right sub matrix starting at (r, r) 2539 2540 float * v1 = (float *) _alloca16( numClamped * sizeof( float ) ); 2541 float * v2 = (float *) _alloca16( numClamped * sizeof( float ) ); 2542 2543 float diag = idMath::SQRT_1OVER2; 2544 v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag; 2545 v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag; 2546 for ( int i = r+1; i < numClamped; i++ ) { 2547 v1[i] = v2[i] = addSub[i] * diag; 2548 } 2549 2550 float alpha1 = 1.0f; 2551 float alpha2 = -1.0f; 2552 2553 // simultaneous update/downdate of the sub matrix starting at (r, r) 2554 int n = clamped.GetNumColumns(); 2555 for ( int i = r; i < numClamped; i++ ) { 2556 2557 diag = clamped[i][i]; 2558 float p1 = v1[i]; 2559 float newDiag = diag + alpha1 * p1 * p1; 2560 2561 if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 2562 idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); 2563 newDiag = idMath::FLT_SMALLEST_NON_DENORMAL; 2564 } 2565 2566 alpha1 /= newDiag; 2567 float beta1 = p1 * alpha1; 2568 alpha1 *= diag; 2569 2570 diag = newDiag; 2571 float p2 = v2[i]; 2572 newDiag = diag + alpha2 * p2 * p2; 2573 2574 if ( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { 2575 idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); 2576 newDiag = idMath::FLT_SMALLEST_NON_DENORMAL; 2577 } 2578 2579 clamped[i][i] = newDiag; 2580 float invNewDiag = 1.0f / newDiag; 2581 diagonal[i] = invNewDiag; 2582 2583 alpha2 *= invNewDiag; 2584 float beta2 = p2 * alpha2; 2585 alpha2 *= diag; 2586 2587 // update column below diagonal (i,i) 2588 float * ptr = clamped.ToFloatPtr() + i; 2589 2590 int j; 2591 for ( j = i+1; j < numClamped - 1; j += 2 ) { 2592 2593 float sum0 = ptr[(j+0)*n]; 2594 float sum1 = ptr[(j+1)*n]; 2595 2596 v1[j+0] -= p1 * sum0; 2597 v1[j+1] -= p1 * sum1; 2598 2599 sum0 += beta1 * v1[j+0]; 2600 sum1 += beta1 * v1[j+1]; 2601 2602 v2[j+0] -= p2 * sum0; 2603 v2[j+1] -= p2 * sum1; 2604 2605 sum0 += beta2 * v2[j+0]; 2606 sum1 += beta2 * v2[j+1]; 2607 2608 ptr[(j+0)*n] = sum0; 2609 ptr[(j+1)*n] = sum1; 2610 } 2611 2612 for ( ; j < numClamped; j++ ) { 2613 2614 float sum = ptr[j*n]; 2615 2616 v1[j] -= p1 * sum; 2617 sum += beta1 * v1[j]; 2618 2619 v2[j] -= p2 * sum; 2620 sum += beta2 * v2[j]; 2621 2622 ptr[j*n] = sum; 2623 } 2624 } 2625 } 2626 2627 /* 2628 ======================== 2629 idLCP_Symmetric::CalcForceDelta 2630 2631 Modifies this->delta_f. 2632 ======================== 2633 */ 2634 ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) { 2635 2636 delta_f[d] = dir; 2637 2638 if ( numClamped == 0 ) { 2639 return; 2640 } 2641 2642 // solve force delta 2643 SolveClamped( delta_f, rowPtrs[d] ); 2644 2645 // flip force delta based on direction 2646 if ( dir > 0.0f ) { 2647 float * ptr = delta_f.ToFloatPtr(); 2648 for ( int i = 0; i < numClamped; i++ ) { 2649 ptr[i] = - ptr[i]; 2650 } 2651 } 2652 } 2653 2654 /* 2655 ======================== 2656 idLCP_Symmetric::CalcAccelDelta 2657 2658 Modifies this->delta_a and uses this->delta_f. 2659 ======================== 2660 */ 2661 ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) { 2662 // only the not clamped variables, including the current variable, can have a change in acceleration 2663 for ( int j = numClamped; j <= d; j++ ) { 2664 // only the clamped variables and the current variable have a force delta unequal zero 2665 float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped ); 2666 delta_a[j] = dot + rowPtrs[j][d] * delta_f[d]; 2667 } 2668 } 2669 2670 /* 2671 ======================== 2672 idLCP_Symmetric::ChangeForce 2673 2674 Modifies this->f and uses this->delta_f. 2675 ======================== 2676 */ 2677 ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) { 2678 // only the clamped variables and current variable have a force delta unequal zero 2679 MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped ); 2680 f[d] += step * delta_f[d]; 2681 } 2682 2683 /* 2684 ======================== 2685 idLCP_Symmetric::ChangeAccel 2686 2687 Modifies this->a and uses this->delta_a. 2688 ======================== 2689 */ 2690 ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) { 2691 // only the not clamped variables, including the current variable, can have an acceleration unequal zero 2692 MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 ); 2693 } 2694 2695 /* 2696 ======================== 2697 idLCP_Symmetric::Solve 2698 ======================== 2699 */ 2700 bool idLCP_Symmetric::Solve( const idMatX &o_m, idVecX &o_x, const idVecX &o_b, const idVecX &o_lo, const idVecX &o_hi, const int *o_boxIndex ) { 2701 2702 // true when the matrix rows are 16 byte padded 2703 padded = ((o_m.GetNumRows()+3)&~3) == o_m.GetNumColumns(); 2704 2705 assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() ); 2706 assert( o_x.GetSize() == o_m.GetNumRows() ); 2707 assert( o_b.GetSize() == o_m.GetNumRows() ); 2708 assert( o_lo.GetSize() == o_m.GetNumRows() ); 2709 assert( o_hi.GetSize() == o_m.GetNumRows() ); 2710 2711 // allocate memory for permuted input 2712 f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) ); 2713 a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); 2714 b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); 2715 lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) ); 2716 hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) ); 2717 if ( o_boxIndex != NULL ) { 2718 boxIndex = (int *)_alloca16( o_x.GetSize() * sizeof( int ) ); 2719 memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) ); 2720 } else { 2721 boxIndex = NULL; 2722 } 2723 2724 // we override the const on o_m here but on exit the matrix is unchanged 2725 m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float * >( o_m[0] ) ); 2726 f.Zero(); 2727 a.Zero(); 2728 b = o_b; 2729 lo = o_lo; 2730 hi = o_hi; 2731 2732 // pointers to the rows of m 2733 rowPtrs = (float **) _alloca16( m.GetNumRows() * sizeof( float * ) ); 2734 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2735 rowPtrs[i] = m[i]; 2736 } 2737 2738 // tells if a variable is at the low boundary, high boundary or inbetween 2739 side = (int *) _alloca16( m.GetNumRows() * sizeof( int ) ); 2740 2741 // index to keep track of the permutation 2742 permuted = (int *) _alloca16( m.GetNumRows() * sizeof( int ) ); 2743 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2744 permuted[i] = i; 2745 } 2746 2747 // permute input so all unbounded variables come first 2748 numUnbounded = 0; 2749 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2750 if ( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) { 2751 if ( numUnbounded != i ) { 2752 Swap( numUnbounded, i ); 2753 } 2754 numUnbounded++; 2755 } 2756 } 2757 2758 // permute input so all variables using the boxIndex come last 2759 int boxStartIndex = m.GetNumRows(); 2760 if ( boxIndex != NULL ) { 2761 for ( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) { 2762 if ( boxIndex[i] >= 0 ) { 2763 boxStartIndex--; 2764 if ( boxStartIndex != i ) { 2765 Swap( boxStartIndex, i ); 2766 } 2767 } 2768 } 2769 } 2770 2771 // sub matrix for factorization 2772 clamped.SetDataCacheLines( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA_CACHE_LINES( m.GetNumRows() * m.GetNumColumns() ), true ); 2773 diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2774 solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2775 solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2776 2777 // all unbounded variables are clamped 2778 numClamped = numUnbounded; 2779 2780 // if there are unbounded variables 2781 if ( numUnbounded ) { 2782 2783 // factor and solve for unbounded variables 2784 if ( !FactorClamped() ) { 2785 idLib::Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" ); 2786 return false; 2787 } 2788 SolveClamped( f, b.ToFloatPtr() ); 2789 2790 // if there are no bounded variables we are done 2791 if ( numUnbounded == m.GetNumRows() ) { 2792 o_x = f; // the vector is not permuted 2793 return true; 2794 } 2795 } 2796 2797 int numIgnored = 0; 2798 2799 // allocate for delta force and delta acceleration 2800 delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2801 delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); 2802 2803 // solve for bounded variables 2804 idStr failed; 2805 for ( int i = numUnbounded; i < m.GetNumRows(); i++ ) { 2806 2807 clampedChangeStart = 0; 2808 2809 // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index 2810 if ( i == boxStartIndex ) { 2811 for ( int j = 0; j < boxStartIndex; j++ ) { 2812 o_x[permuted[j]] = f[j]; 2813 } 2814 for ( int j = boxStartIndex; j < m.GetNumRows(); j++ ) { 2815 float s = o_x[boxIndex[j]]; 2816 if ( lo[j] != -idMath::INFINITY ) { 2817 lo[j] = - idMath::Fabs( lo[j] * s ); 2818 } 2819 if ( hi[j] != idMath::INFINITY ) { 2820 hi[j] = idMath::Fabs( hi[j] * s ); 2821 } 2822 } 2823 } 2824 2825 // calculate acceleration for current variable 2826 float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i ); 2827 a[i] = dot - b[i]; 2828 2829 // if already at the low boundary 2830 if ( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) { 2831 side[i] = -1; 2832 continue; 2833 } 2834 2835 // if already at the high boundary 2836 if ( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) { 2837 side[i] = 1; 2838 continue; 2839 } 2840 2841 // if inside the clamped region 2842 if ( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) { 2843 side[i] = 0; 2844 AddClamped( i, false ); 2845 continue; 2846 } 2847 2848 // drive the current variable into a valid region 2849 int n = 0; 2850 for ( ; n < maxIterations; n++ ) { 2851 2852 // direction to move 2853 float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f; 2854 2855 // calculate force delta 2856 CalcForceDelta( i, dir ); 2857 2858 // calculate acceleration delta: delta_a = m * delta_f; 2859 CalcAccelDelta( i ); 2860 2861 float maxStep; 2862 int limit; 2863 int limitSide; 2864 2865 // maximum step we can take 2866 GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(), 2867 lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped, 2868 i, dir, maxStep, limit, limitSide ); 2869 2870 if ( maxStep <= 0.0f ) { 2871 #ifdef IGNORE_UNSATISFIABLE_VARIABLES 2872 // ignore the current variable completely 2873 lo[i] = hi[i] = 0.0f; 2874 f[i] = 0.0f; 2875 side[i] = -1; 2876 numIgnored++; 2877 #else 2878 failed.Format( "invalid step size %.4f", maxStep ); 2879 for ( int j = i; j < m.GetNumRows(); j++ ) { 2880 f[j] = 0.0f; 2881 } 2882 numIgnored = m.GetNumRows() - i; 2883 #endif 2884 break; 2885 } 2886 2887 // change force 2888 ChangeForce( i, maxStep ); 2889 2890 // change acceleration 2891 ChangeAccel( i, maxStep ); 2892 2893 // clamp/unclamp the variable that limited this step 2894 side[limit] = limitSide; 2895 if ( limitSide == 0 ) { 2896 a[limit] = 0.0f; 2897 AddClamped( limit, ( limit == i ) ); 2898 } else if ( limitSide == -1 ) { 2899 f[limit] = lo[limit]; 2900 if ( limit != i ) { 2901 RemoveClamped( limit ); 2902 } 2903 } else if ( limitSide == 1 ) { 2904 f[limit] = hi[limit]; 2905 if ( limit != i ) { 2906 RemoveClamped( limit ); 2907 } 2908 } 2909 2910 // if the current variable limited the step we can continue with the next variable 2911 if ( limit == i ) { 2912 break; 2913 } 2914 } 2915 2916 if ( n >= maxIterations ) { 2917 failed.Format( "max iterations %d", maxIterations ); 2918 break; 2919 } 2920 2921 if ( failed.Length() ) { 2922 break; 2923 } 2924 } 2925 2926 #ifdef IGNORE_UNSATISFIABLE_VARIABLES 2927 if ( numIgnored ) { 2928 if ( lcp_showFailures.GetBool() ) { 2929 idLib::Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded ); 2930 } 2931 } 2932 #endif 2933 2934 // if failed clear remaining forces 2935 if ( failed.Length() ) { 2936 if ( lcp_showFailures.GetBool() ) { 2937 idLib::Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded ); 2938 } 2939 } 2940 2941 #if defined(_DEBUG) && 0 2942 if ( failed.Length() ) { 2943 // test whether or not the solution satisfies the complementarity conditions 2944 for ( int i = 0; i < m.GetNumRows(); i++ ) { 2945 a[i] = -b[i]; 2946 for ( j = 0; j < m.GetNumRows(); j++ ) { 2947 a[i] += rowPtrs[i][j] * f[j]; 2948 } 2949 2950 if ( f[i] == lo[i] ) { 2951 if ( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) { 2952 int bah1 = 1; 2953 } 2954 } else if ( f[i] == hi[i] ) { 2955 if ( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) { 2956 int bah2 = 1; 2957 } 2958 } else if ( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) { 2959 int bah3 = 1; 2960 } 2961 } 2962 } 2963 #endif 2964 2965 // unpermute result 2966 for ( int i = 0; i < f.GetSize(); i++ ) { 2967 o_x[permuted[i]] = f[i]; 2968 } 2969 2970 return true; 2971 } 2972 2973 /* 2974 ================================================================================================ 2975 2976 idLCP 2977 2978 ================================================================================================ 2979 */ 2980 2981 /* 2982 ======================== 2983 idLCP::AllocSquare 2984 ======================== 2985 */ 2986 idLCP *idLCP::AllocSquare() { 2987 idLCP *lcp = new idLCP_Square; 2988 lcp->SetMaxIterations( 32 ); 2989 return lcp; 2990 } 2991 2992 /* 2993 ======================== 2994 idLCP::AllocSymmetric 2995 ======================== 2996 */ 2997 idLCP *idLCP::AllocSymmetric() { 2998 idLCP *lcp = new idLCP_Symmetric; 2999 lcp->SetMaxIterations( 32 ); 3000 return lcp; 3001 } 3002 3003 /* 3004 ======================== 3005 idLCP::~idLCP 3006 ======================== 3007 */ 3008 idLCP::~idLCP() { 3009 } 3010 3011 /* 3012 ======================== 3013 idLCP::SetMaxIterations 3014 ======================== 3015 */ 3016 void idLCP::SetMaxIterations( int max ) { 3017 maxIterations = max; 3018 } 3019 3020 /* 3021 ======================== 3022 idLCP::GetMaxIterations 3023 ======================== 3024 */ 3025 int idLCP::GetMaxIterations() { 3026 return maxIterations; 3027 } 3028 3029 /* 3030 ======================== 3031 idLCP::Test_f 3032 ======================== 3033 */ 3034 void idLCP::Test_f( const idCmdArgs &args ) { 3035 #ifdef ENABLE_TEST_CODE 3036 DotProduct_Test(); 3037 LowerTriangularSolve_Test(); 3038 LowerTriangularSolveTranspose_Test(); 3039 LDLT_Factor_Test(); 3040 #endif 3041 }