/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#pragma hdrstop
#include "../precompiled.h"
#include "Simd_Generic.h"
#include "Simd_SSE.h"

//===============================================================
//                                                        M
//  SSE implementation of idSIMDProcessor                MrE
//                                                        E
//===============================================================

#include <xmmintrin.h>

#define M_PI	3.14159265358979323846f

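// Note: several of the "intrinsics" used below ( _mm_madd_ps, _mm_nmsub_ps, _mm_perm_ps,
// _mm_splat_ps, _mm_sel_ps ) and the helpers __m128c, __fsels and __frsqrts are not raw
// SSE intrinsics; they are wrappers defined elsewhere in the engine (presumably in a
// shared intrinsics header). From their use in this file, their semantics are:
//
//	_mm_madd_ps( a, b, c )		= a * b + c		(emulated fused multiply-add)
//	_mm_nmsub_ps( a, b, c )		= c - a * b		(emulated negated multiply-subtract)
//	_mm_splat_ps( x, i )		= broadcast element i of x to all four lanes
//	_mm_perm_ps( x, perm )		= shuffle of a single register
//	_mm_sel_ps( a, b, mask )	= per-bit select: b where mask is set, a elsewhere
//	__m128c				= cast helper between integer and float register views
//	__fsels( a, b, c )		= b if a >= 0.0f, otherwise c	(scalar float select)
//	__frsqrts( x )			= scalar reciprocal square root estimate
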
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName() const {
	return "MMX & SSE";
}

/*
============
idSIMD_SSE::BlendJoints
============
*/
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {

	if ( lerp <= 0.0f ) {
		return;
	} else if ( lerp >= 1.0f ) {
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	const __m128 vlerp = { lerp, lerp, lerp, lerp };

	const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_sign_bit = __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0 = { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
	const __m128 vector_float_tiny = { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
	const __m128 vector_float_half_pi = { M_PI*0.5f, M_PI*0.5f, M_PI*0.5f, M_PI*0.5f };

	const __m128 vector_float_sin_c0 = { -2.39e-08f, -2.39e-08f, -2.39e-08f, -2.39e-08f };
	const __m128 vector_float_sin_c1 = { 2.7526e-06f, 2.7526e-06f, 2.7526e-06f, 2.7526e-06f };
	const __m128 vector_float_sin_c2 = { -1.98409e-04f, -1.98409e-04f, -1.98409e-04f, -1.98409e-04f };
	const __m128 vector_float_sin_c3 = { 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f };
	const __m128 vector_float_sin_c4 = { -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f };

	const __m128 vector_float_atan_c0 = { 0.0028662257f, 0.0028662257f, 0.0028662257f, 0.0028662257f };
	const __m128 vector_float_atan_c1 = { -0.0161657367f, -0.0161657367f, -0.0161657367f, -0.0161657367f };
	const __m128 vector_float_atan_c2 = { 0.0429096138f, 0.0429096138f, 0.0429096138f, 0.0429096138f };
	const __m128 vector_float_atan_c3 = { -0.0752896400f, -0.0752896400f, -0.0752896400f, -0.0752896400f };
	const __m128 vector_float_atan_c4 = { 0.1065626393f, 0.1065626393f, 0.1065626393f, 0.1065626393f };
	const __m128 vector_float_atan_c5 = { -0.1420889944f, -0.1420889944f, -0.1420889944f, -0.1420889944f };
	const __m128 vector_float_atan_c6 = { 0.1999355085f, 0.1999355085f, 0.1999355085f, 0.1999355085f };
	const __m128 vector_float_atan_c7 = { -0.3333314528f, -0.3333314528f, -0.3333314528f, -0.3333314528f };

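	// The sin_c* and atan_c* constants above are the coefficients of the polynomial
	// approximations used by idMath::Sin16() and idMath::ATan16(); the SIMD loop below
	// evaluates the same approximations four joints at a time, so its results match the
	// scalar tail loop at the end of this function.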
	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );

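		// Transpose the four joint quaternions and the four blend quaternions from
		// AoS to SoA form: after the unpacks below, jq[xyzw]_0 / bq[xyzw]_0 each hold
		// one component of all four quaternions, so the slerp math can run on four
		// joints in parallel.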
		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );

		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosomg_0 = _mm_add_ps( cosome_0, cosomf_0 );

		__m128 sign_0 = _mm_and_ps( cosomg_0, vector_float_sign_bit );
		__m128 cosom_0 = _mm_xor_ps( cosomg_0, sign_0 );
		__m128 ss_0 = _mm_nmsub_ps( cosom_0, cosom_0, vector_float_one );

		ss_0 = _mm_max_ps( ss_0, vector_float_tiny );

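		// One Newton-Raphson iteration on the _mm_rsqrt_ps() estimate:
		// y1 = -0.5 * y0 * ( ss * y0 * y0 - 3 ) ~= 1 / sqrt( ss ), so sinom_0 below is
		// the reciprocal of sin( omega ), and ss_0 * sinom_0 is sin( omega ) itself.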
		__m128 rs_0 = _mm_rsqrt_ps( ss_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( ss_0, sq_0, vector_float_rsqrt_c0 );
		__m128 sinom_0 = _mm_mul_ps( sh_0, sx_0 );			// sinom = sqrt( ss );

		ss_0 = _mm_mul_ps( ss_0, sinom_0 );

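		// Branchless atan2( sin, cos ): divide the smaller of the two values by the
		// larger one so the ratio stays in [0,1], evaluate the atan polynomial on it,
		// and when the arguments had to be swapped correct the result with pi/2
		// (maskPI_0) and a sign flip (masksign_0), mirroring idMath::ATan16().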
		__m128 min_0 = _mm_min_ps( ss_0, cosom_0 );
		__m128 max_0 = _mm_max_ps( ss_0, cosom_0 );
		__m128 mask_0 = _mm_cmpeq_ps( min_0, cosom_0 );
		__m128 masksign_0 = _mm_and_ps( mask_0, vector_float_sign_bit );
		__m128 maskPI_0 = _mm_and_ps( mask_0, vector_float_half_pi );

		__m128 rcpa_0 = _mm_rcp_ps( max_0 );
		__m128 rcpb_0 = _mm_mul_ps( max_0, rcpa_0 );
		__m128 rcpd_0 = _mm_add_ps( rcpa_0, rcpa_0 );
		__m128 rcp_0 = _mm_nmsub_ps( rcpb_0, rcpa_0, rcpd_0 );		// 1 / y or 1 / x
		__m128 ata_0 = _mm_mul_ps( min_0, rcp_0 );			// x / y or y / x

		__m128 atb_0 = _mm_xor_ps( ata_0, masksign_0 );			// -x / y or y / x
		__m128 atc_0 = _mm_mul_ps( atb_0, atb_0 );
		__m128 atd_0 = _mm_madd_ps( atc_0, vector_float_atan_c0, vector_float_atan_c1 );

		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c2 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c3 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c4 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c5 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c6 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c7 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_one );

		__m128 omega_a_0 = _mm_madd_ps( atd_0, atb_0, maskPI_0 );
		__m128 omega_b_0 = _mm_mul_ps( vlerp, omega_a_0 );
		omega_a_0 = _mm_sub_ps( omega_a_0, omega_b_0 );

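		// omega_a_0 = ( 1 - lerp ) * omega and omega_b_0 = lerp * omega; both sines
		// are evaluated with the same odd polynomial x * ( 1 + x^2 * ( c4 + x^2 * ... ) ),
		// giving scalea_0 = sin( ( 1 - lerp ) * omega ) / sin( omega ) and
		// scaleb_0 = sin( lerp * omega ) / sin( omega ), the standard slerp weights.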
		__m128 sinsa_0 = _mm_mul_ps( omega_a_0, omega_a_0 );
		__m128 sinsb_0 = _mm_mul_ps( omega_b_0, omega_b_0 );
		__m128 sina_0 = _mm_madd_ps( sinsa_0, vector_float_sin_c0, vector_float_sin_c1 );
		__m128 sinb_0 = _mm_madd_ps( sinsb_0, vector_float_sin_c0, vector_float_sin_c1 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c2 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c2 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c3 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c3 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c4 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c4 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_one );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_one );
		sina_0 = _mm_mul_ps( sina_0, omega_a_0 );
		sinb_0 = _mm_mul_ps( sinb_0, omega_b_0 );
		__m128 scalea_0 = _mm_mul_ps( sina_0, sinom_0 );
		__m128 scaleb_0 = _mm_mul_ps( sinb_0, sinom_0 );

		scaleb_0 = _mm_xor_ps( scaleb_0, sign_0 );

		jqx_0 = _mm_mul_ps( jqx_0, scalea_0 );
		jqy_0 = _mm_mul_ps( jqy_0, scalea_0 );
		jqz_0 = _mm_mul_ps( jqz_0, scalea_0 );
		jqw_0 = _mm_mul_ps( jqw_0, scalea_0 );

		jqx_0 = _mm_madd_ps( bqx_0, scaleb_0, jqx_0 );
		jqy_0 = _mm_madd_ps( bqy_0, scaleb_0, jqy_0 );
		jqz_0 = _mm_madd_ps( bqz_0, scaleb_0, jqz_0 );
		jqw_0 = _mm_madd_ps( bqw_0, scaleb_0, jqw_0 );

		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}

	for ( ; i < numJoints; i++ ) {
		int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
		joints[n].w = 0.0f;

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float sinom;
		float omega;
		float scale0;
		float scale1;
		unsigned long signBit;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );

		(*(unsigned long *)&cosom) ^= signBit;

		scale0 = 1.0f - cosom * cosom;
		scale0 = ( scale0 <= 0.0f ) ? 1e-10f : scale0;
		sinom = idMath::InvSqrt( scale0 );
		omega = idMath::ATan16( scale0 * sinom, cosom );
		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
		scale1 = idMath::Sin16( lerp * omega ) * sinom;

		(*(unsigned long *)&scale1) ^= signBit;

		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
		jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
		jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
	}
}

/*
============
idSIMD_SSE::BlendJointsFast
============
*/
void VPCALL idSIMD_SSE::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( blendJoints );
	assert_16_byte_aligned( JOINTQUAT_Q_OFFSET );
	assert_16_byte_aligned( JOINTQUAT_T_OFFSET );
	assert_sizeof_16_byte_multiple( idJointQuat );

	if ( lerp <= 0.0f ) {
		return;
	} else if ( lerp >= 1.0f ) {
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

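	// Instead of a true slerp, this routine lerps towards the blend quaternion and
	// renormalizes (an "nlerp"). Adding scaledLerp = lerp / ( 1 - lerp ) times the
	// blend quaternion to the unscaled joint quaternion yields a vector proportional
	// to ( 1 - lerp ) * q0 + lerp * q1, and the final normalization removes the
	// ( 1 - lerp ) factor. The sign of the dot product selects the shortest arc.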
	const __m128 vector_float_sign_bit = __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0 = { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };

	const float scaledLerp = lerp / ( 1.0f - lerp );
	const __m128 vlerp = { lerp, lerp, lerp, lerp };
	const __m128 vscaledLerp = { scaledLerp, scaledLerp, scaledLerp, scaledLerp };

	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );

		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );

		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosom_0 = _mm_add_ps( cosome_0, cosomf_0 );

		__m128 sign_0 = _mm_and_ps( cosom_0, vector_float_sign_bit );

		__m128 scale_0 = _mm_xor_ps( vscaledLerp, sign_0 );

		jqx_0 = _mm_madd_ps( scale_0, bqx_0, jqx_0 );
		jqy_0 = _mm_madd_ps( scale_0, bqy_0, jqy_0 );
		jqz_0 = _mm_madd_ps( scale_0, bqz_0, jqz_0 );
		jqw_0 = _mm_madd_ps( scale_0, bqw_0, jqw_0 );

		__m128 da_0 = _mm_mul_ps( jqx_0, jqx_0 );
		__m128 db_0 = _mm_mul_ps( jqy_0, jqy_0 );
		__m128 dc_0 = _mm_mul_ps( jqz_0, jqz_0 );
		__m128 dd_0 = _mm_mul_ps( jqw_0, jqw_0 );

		__m128 de_0 = _mm_add_ps( da_0, db_0 );
		__m128 df_0 = _mm_add_ps( dc_0, dd_0 );
		__m128 d_0 = _mm_add_ps( de_0, df_0 );

		__m128 rs_0 = _mm_rsqrt_ps( d_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( d_0, sq_0, vector_float_rsqrt_c0 );
		__m128 s_0 = _mm_mul_ps( sh_0, sx_0 );

		jqx_0 = _mm_mul_ps( jqx_0, s_0 );
		jqy_0 = _mm_mul_ps( jqy_0, s_0 );
		jqz_0 = _mm_mul_ps( jqz_0, s_0 );
		jqw_0 = _mm_mul_ps( jqw_0, s_0 );

		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}

	for ( ; i < numJoints; i++ ) {
		const int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float scale;
		float s;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		scale = __fsels( cosom, scaledLerp, -scaledLerp );

		jointQuat.x += scale * blendQuat.x;
		jointQuat.y += scale * blendQuat.y;
		jointQuat.z += scale * blendQuat.z;
		jointQuat.w += scale * blendQuat.w;

		s = jointQuat.x * jointQuat.x + jointQuat.y * jointQuat.y + jointQuat.z * jointQuat.z + jointQuat.w * jointQuat.w;
		s = __frsqrts( s );

		jointQuat.x *= s;
		jointQuat.y *= s;
		jointQuat.z *= s;
		jointQuat.w *= s;
	}
}

/*
============
idSIMD_SSE::ConvertJointQuatsToJointMats
============
*/
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

	const float * jointQuatPtr = (float *)jointQuats;
	float * jointMatPtr = (float *)jointMats;

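	// Standard quaternion-to-matrix conversion: with x2 = 2x etc., the rotation part is
	//
	//	| 1 - yy2 - zz2    xy2 - wz2        xz2 + wy2     |
	//	| xy2 + wz2        1 - xx2 - zz2    yz2 - wx2     |
	//	| xz2 - wy2        yz2 + wx2        1 - xx2 - yy2 |
	//
	// The constants below build these terms with sign-flip masks so that two joints
	// can be converted per iteration of the main loop without any branches.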
	const __m128 vector_float_first_sign_bit = __m128c( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x80000000 ) );
	const __m128 vector_float_last_three_sign_bits = __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x00000000 ) );
	const __m128 vector_float_first_pos_half = { 0.5f, 0.0f, 0.0f, 0.0f };		// +.5 0 0 0
	const __m128 vector_float_first_neg_half = { -0.5f, 0.0f, 0.0f, 0.0f };	// -.5 0 0 0
	const __m128 vector_float_quat2mat_mad1 = { -1.0f, -1.0f, +1.0f, -1.0f };	// - - + -
	const __m128 vector_float_quat2mat_mad2 = { -1.0f, +1.0f, -1.0f, -1.0f };	// - + - -
	const __m128 vector_float_quat2mat_mad3 = { +1.0f, -1.0f, -1.0f, +1.0f };	// + - - +

	int i = 0;
	for ( ; i + 1 < numJoints; i += 2 ) {

		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 q1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+0] );

		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );
		__m128 t1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+4] );

		__m128 d0 = _mm_add_ps( q0, q0 );
		__m128 d1 = _mm_add_ps( q1, q1 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2
		__m128 sa1 = _mm_perm_ps( q1, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb1 = _mm_perm_ps( d1, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc1 = _mm_perm_ps( q1, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd1 = _mm_perm_ps( d1, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sa1 = _mm_xor_ps( sa1, vector_float_first_sign_bit );

		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions
		sc1 = _mm_xor_ps( sc1, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions

		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );	// .5 - yy2, xy2, xz2, yz2		// .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );	// -.5 + zz2, wz2, wy2, wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		// .5 - xx2, -yy2, -zz2, -ww2	// .5 0 0 0
		__m128 ma1 = _mm_add_ps( _mm_mul_ps( sa1, sb1 ), vector_float_first_pos_half );	// .5 - yy2, xy2, xz2, yz2		// .5 0 0 0
		__m128 mb1 = _mm_add_ps( _mm_mul_ps( sc1, sd1 ), vector_float_first_neg_half );	// -.5 + zz2, wz2, wy2, wx2		// -.5 0 0 0
		__m128 mc1 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q1, d1 ) );		// .5 - xx2, -yy2, -zz2, -ww2	// .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2		// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2		// 00, 01, 12, 13
		__m128 mf1 = _mm_shuffle_ps( ma1, mc1, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md1 = _mm_shuffle_ps( mf1, ma1, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2		// 10, 01, 02, 03
		__m128 me1 = _mm_shuffle_ps( ma1, mb1, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2		// 00, 01, 12, 13

		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,	// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +
		__m128 ra1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad1 ), ma1 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,	// - - + -
		__m128 rb1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad2 ), md1 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc1 = _mm_add_ps( _mm_mul_ps( me1, vector_float_quat2mat_mad3 ), md1 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +

		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );
		__m128 ta1 = _mm_shuffle_ps( ra1, t1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb1 = _mm_shuffle_ps( rb1, t1, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc1 = _mm_shuffle_ps( rc1, t1, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12
		ra1 = _mm_shuffle_ps( ra1, ta1, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb1 = _mm_shuffle_ps( rb1, tb1, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc1 = _mm_shuffle_ps( rc1, tc1, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12

		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+0], ra1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+4], rb1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+8], rc1 );
	}

	for ( ; i < numJoints; i++ ) {

		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );

		__m128 d0 = _mm_add_ps( q0, q0 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );	// y, x, x, y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );	// y2, y2, z2, z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );	// z, w, w, w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );	// z2, z2, y2, x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );	// flip stupid inverse quaternions

		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );	// .5 - yy2, xy2, xz2, yz2		// .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );	// -.5 + zz2, wz2, wy2, wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		// .5 - xx2, -yy2, -zz2, -ww2	// .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );	// xy2, xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );	// .5 - xx2, xy2, xz2, yz2		// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );	// .5 - yy2, xy2, wy2, wx2		// 00, 01, 12, 13

		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );	// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,	// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );	// 1 - xx2 - zz2, xy2 + wz2, , yz2 - wx2	// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );	// 1 - xx2 - yy2, , xz2 - wy2, yz2 + wx2	// + - - +

		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );	// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );	// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );	// 02 03 00 12

		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
	}
}

/*
============
idSIMD_SSE::ConvertJointMatsToJointQuats
============
*/
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {

	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

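	// Branchless variant of the usual matrix-to-quaternion conversion: the scalar code
	// in the tail loop below picks one of four formulas depending on which of
	// "trace > 0", "m00 largest" or "m11 largest" holds; here the four cases are
	// evaluated with compare masks ( b0/b1/b2 -> m0..m3 ) and the component swaps and
	// sign flips are applied with selects instead of branches.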
	const __m128 vector_float_zero = _mm_setzero_ps();
	const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_not = __m128c( _mm_set_epi32( -1, -1, -1, -1 ) );
	const __m128 vector_float_sign_bit = __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0 = { -3.0f, -3.0f, -3.0f, -3.0f };
	const __m128 vector_float_rsqrt_c2 = { -0.25f, -0.25f, -0.25f, -0.25f };

	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		__m128 ma0 = _mm_load_ps( &m[0*12+0] );
		__m128 ma1 = _mm_load_ps( &m[0*12+4] );
		__m128 ma2 = _mm_load_ps( &m[0*12+8] );

		__m128 mb0 = _mm_load_ps( &m[1*12+0] );
		__m128 mb1 = _mm_load_ps( &m[1*12+4] );
		__m128 mb2 = _mm_load_ps( &m[1*12+8] );

		__m128 mc0 = _mm_load_ps( &m[2*12+0] );
		__m128 mc1 = _mm_load_ps( &m[2*12+4] );
		__m128 mc2 = _mm_load_ps( &m[2*12+8] );

		__m128 md0 = _mm_load_ps( &m[3*12+0] );
		__m128 md1 = _mm_load_ps( &m[3*12+4] );
		__m128 md2 = _mm_load_ps( &m[3*12+8] );

		__m128 ta0 = _mm_unpacklo_ps( ma0, mc0 );	// a0, c0, a1, c1
		__m128 ta1 = _mm_unpackhi_ps( ma0, mc0 );	// a2, c2, a3, c3
		__m128 ta2 = _mm_unpacklo_ps( mb0, md0 );	// b0, d0, b1, d1
		__m128 ta3 = _mm_unpackhi_ps( mb0, md0 );	// b2, d2, b3, d3

		__m128 tb0 = _mm_unpacklo_ps( ma1, mc1 );	// a0, c0, a1, c1
		__m128 tb1 = _mm_unpackhi_ps( ma1, mc1 );	// a2, c2, a3, c3
		__m128 tb2 = _mm_unpacklo_ps( mb1, md1 );	// b0, d0, b1, d1
		__m128 tb3 = _mm_unpackhi_ps( mb1, md1 );	// b2, d2, b3, d3

		__m128 tc0 = _mm_unpacklo_ps( ma2, mc2 );	// a0, c0, a1, c1
		__m128 tc1 = _mm_unpackhi_ps( ma2, mc2 );	// a2, c2, a3, c3
		__m128 tc2 = _mm_unpacklo_ps( mb2, md2 );	// b0, d0, b1, d1
		__m128 tc3 = _mm_unpackhi_ps( mb2, md2 );	// b2, d2, b3, d3

		__m128 m00 = _mm_unpacklo_ps( ta0, ta2 );
		__m128 m01 = _mm_unpackhi_ps( ta0, ta2 );
		__m128 m02 = _mm_unpacklo_ps( ta1, ta3 );
		__m128 m03 = _mm_unpackhi_ps( ta1, ta3 );

		__m128 m10 = _mm_unpacklo_ps( tb0, tb2 );
		__m128 m11 = _mm_unpackhi_ps( tb0, tb2 );
		__m128 m12 = _mm_unpacklo_ps( tb1, tb3 );
		__m128 m13 = _mm_unpackhi_ps( tb1, tb3 );

		__m128 m20 = _mm_unpacklo_ps( tc0, tc2 );
		__m128 m21 = _mm_unpackhi_ps( tc0, tc2 );
		__m128 m22 = _mm_unpacklo_ps( tc1, tc3 );
		__m128 m23 = _mm_unpackhi_ps( tc1, tc3 );

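		// Case selection: b0 = trace > 0, b1 = m00 is the largest diagonal element,
		// b2 = m11 > m22. Exactly one of m0..m3 is all-ones per lane; i0/i1/i2 derive
		// the sign flips s0/s1/s2 that fold the four quaternion formulas into a single
		// code path.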
		__m128 b00 = _mm_add_ps( m00, m11 );
		__m128 b11 = _mm_cmpgt_ps( m00, m22 );
		__m128 b01 = _mm_add_ps( b00, m22 );
		__m128 b10 = _mm_cmpgt_ps( m00, m11 );
		__m128 b0 = _mm_cmpgt_ps( b01, vector_float_zero );
		__m128 b1 = _mm_and_ps( b10, b11 );
		__m128 b2 = _mm_cmpgt_ps( m11, m22 );

		__m128 m0 = b0;
		__m128 m1 = _mm_and_ps( _mm_xor_ps( b0, vector_float_not ), b1 );
		__m128 p1 = _mm_or_ps( b0, b1 );
		__m128 p2 = _mm_or_ps( p1, b2 );
		__m128 m2 = _mm_and_ps( _mm_xor_ps( p1, vector_float_not ), b2 );
		__m128 m3 = _mm_xor_ps( p2, vector_float_not );

		__m128 i0 = _mm_or_ps( m2, m3 );
		__m128 i1 = _mm_or_ps( m1, m3 );
		__m128 i2 = _mm_or_ps( m1, m2 );

		__m128 s0 = _mm_and_ps( i0, vector_float_sign_bit );
		__m128 s1 = _mm_and_ps( i1, vector_float_sign_bit );
		__m128 s2 = _mm_and_ps( i2, vector_float_sign_bit );

		m00 = _mm_xor_ps( m00, s0 );
		m11 = _mm_xor_ps( m11, s1 );
		m22 = _mm_xor_ps( m22, s2 );
		m21 = _mm_xor_ps( m21, s0 );
		m02 = _mm_xor_ps( m02, s1 );
		m10 = _mm_xor_ps( m10, s2 );

		__m128 t0 = _mm_add_ps( m00, m11 );
		__m128 t1 = _mm_add_ps( m22, vector_float_one );
		__m128 q0 = _mm_add_ps( t0, t1 );
		__m128 q1 = _mm_sub_ps( m01, m10 );
		__m128 q2 = _mm_sub_ps( m20, m02 );
		__m128 q3 = _mm_sub_ps( m12, m21 );

		__m128 rs = _mm_rsqrt_ps( q0 );
		__m128 sq = _mm_mul_ps( rs, rs );
		__m128 sh = _mm_mul_ps( rs, vector_float_rsqrt_c2 );
		__m128 sx = _mm_madd_ps( q0, sq, vector_float_rsqrt_c0 );
		__m128 s = _mm_mul_ps( sh, sx );

		q0 = _mm_mul_ps( q0, s );
		q1 = _mm_mul_ps( q1, s );
		q2 = _mm_mul_ps( q2, s );
		q3 = _mm_mul_ps( q3, s );

		m0 = _mm_or_ps( m0, m2 );
		m2 = _mm_or_ps( m2, m3 );

		__m128 fq0 = _mm_sel_ps( q0, q3, m0 );
		__m128 fq1 = _mm_sel_ps( q1, q2, m0 );
		__m128 fq2 = _mm_sel_ps( q2, q1, m0 );
		__m128 fq3 = _mm_sel_ps( q3, q0, m0 );

		__m128 rq0 = _mm_sel_ps( fq0, fq2, m2 );
		__m128 rq1 = _mm_sel_ps( fq1, fq3, m2 );
		__m128 rq2 = _mm_sel_ps( fq2, fq0, m2 );
		__m128 rq3 = _mm_sel_ps( fq3, fq1, m2 );

		__m128 tq0 = _mm_unpacklo_ps( rq0, rq2 );
		__m128 tq1 = _mm_unpackhi_ps( rq0, rq2 );
		__m128 tq2 = _mm_unpacklo_ps( rq1, rq3 );
		__m128 tq3 = _mm_unpackhi_ps( rq1, rq3 );

		__m128 sq0 = _mm_unpacklo_ps( tq0, tq2 );
		__m128 sq1 = _mm_unpackhi_ps( tq0, tq2 );
		__m128 sq2 = _mm_unpacklo_ps( tq1, tq3 );
		__m128 sq3 = _mm_unpackhi_ps( tq1, tq3 );

		__m128 tt0 = _mm_unpacklo_ps( m03, m23 );
		__m128 tt1 = _mm_unpackhi_ps( m03, m23 );
		__m128 tt2 = _mm_unpacklo_ps( m13, vector_float_zero );
		__m128 tt3 = _mm_unpackhi_ps( m13, vector_float_zero );

		__m128 st0 = _mm_unpacklo_ps( tt0, tt2 );
		__m128 st1 = _mm_unpackhi_ps( tt0, tt2 );
		__m128 st2 = _mm_unpacklo_ps( tt1, tt3 );
		__m128 st3 = _mm_unpackhi_ps( tt1, tt3 );

		_mm_store_ps( &q[0*4], sq0 );
		_mm_store_ps( &q[1*4], st0 );
		_mm_store_ps( &q[2*4], sq1 );
		_mm_store_ps( &q[3*4], st1 );
		_mm_store_ps( &q[4*4], sq2 );
		_mm_store_ps( &q[5*4], st2 );
		_mm_store_ps( &q[6*4], sq3 );
		_mm_store_ps( &q[7*4], st3 );
	}

	float sign[2] = { 1.0f, -1.0f };

	for ( ; i < numJoints; i++ ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		int b0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f;
		int b1 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2];
		int b2 = m[1 * 4 + 1] > m[2 * 4 + 2];

		int m0 = b0;
		int m1 = ( !b0 ) & b1;
		int m2 = ( !( b0 | b1 ) ) & b2;
		int m3 = !( b0 | b1 | b2 );

		int i0 = ( m2 | m3 );
		int i1 = ( m1 | m3 );
		int i2 = ( m1 | m2 );

		float s0 = sign[i0];
		float s1 = sign[i1];
		float s2 = sign[i2];

		float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
		float s = __frsqrts( t );
		s = ( t * s * s + -3.0f ) * ( s * -0.25f );

		q[0] = t * s;
		q[1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
		q[2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
		q[3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		if ( m0 | m2 ) {
			// reverse
			SwapValues( q[0], q[3] );
			SwapValues( q[1], q[2] );
		}
		if ( m2 | m3 ) {
			// rotate 2
			SwapValues( q[0], q[2] );
			SwapValues( q[1], q[3] );
		}

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
		q[7] = 0.0f;
	}
}

/*
============
idSIMD_SSE::TransformJoints
============
*/
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	const __m128 vector_float_mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );

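	// Each idJointMat is a 3x4 matrix: a 3x3 rotation with the translation in the
	// fourth column. The loop below concatenates each child with its parent
	// ( child = parent * child ); vector_float_mask_keep_last keeps only the parent's
	// translation element when seeding the accumulation, since the child's fourth
	// column must be rotated and then offset by the parent translation.
	// UntransformJoints() further down applies the inverse transform in reverse order.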
	const float *__restrict firstMatrix = jointMats->ToFloatPtr() + ( firstJoint + firstJoint + firstJoint - 3 ) * 4;

	__m128 pma = _mm_load_ps( firstMatrix + 0 );
	__m128 pmb = _mm_load_ps( firstMatrix + 4 );
	__m128 pmc = _mm_load_ps( firstMatrix + 8 );

	for ( int joint = firstJoint; joint <= lastJoint; joint++ ) {
		const int parent = parents[joint];
		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;

		if ( parent != joint - 1 ) {
			pma = _mm_load_ps( parentMatrix + 0 );
			pmb = _mm_load_ps( parentMatrix + 4 );
			pmc = _mm_load_ps( parentMatrix + 8 );
		}

		__m128 cma = _mm_load_ps( childMatrix + 0 );
		__m128 cmb = _mm_load_ps( childMatrix + 4 );
		__m128 cmc = _mm_load_ps( childMatrix + 8 );

		__m128 ta = _mm_splat_ps( pma, 0 );
		__m128 tb = _mm_splat_ps( pmb, 0 );
		__m128 tc = _mm_splat_ps( pmc, 0 );

		__m128 td = _mm_splat_ps( pma, 1 );
		__m128 te = _mm_splat_ps( pmb, 1 );
		__m128 tf = _mm_splat_ps( pmc, 1 );

		__m128 tg = _mm_splat_ps( pma, 2 );
		__m128 th = _mm_splat_ps( pmb, 2 );
		__m128 ti = _mm_splat_ps( pmc, 2 );

		pma = _mm_madd_ps( ta, cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
		pmb = _mm_madd_ps( tb, cma, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
		pmc = _mm_madd_ps( tc, cma, _mm_and_ps( pmc, vector_float_mask_keep_last ) );

		pma = _mm_madd_ps( td, cmb, pma );
		pmb = _mm_madd_ps( te, cmb, pmb );
		pmc = _mm_madd_ps( tf, cmb, pmc );

		pma = _mm_madd_ps( tg, cmc, pma );
		pmb = _mm_madd_ps( th, cmc, pmb );
		pmc = _mm_madd_ps( ti, cmc, pmc );

		_mm_store_ps( childMatrix + 0, pma );
		_mm_store_ps( childMatrix + 4, pmb );
		_mm_store_ps( childMatrix + 8, pmc );
	}
}

/*
============
idSIMD_SSE::UntransformJoints
============
*/
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	const __m128 vector_float_mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );

	for ( int joint = lastJoint; joint >= firstJoint; joint-- ) {
		assert( parents[joint] < joint );
		const int parent = parents[joint];
		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;

		__m128 pma = _mm_load_ps( parentMatrix + 0 );
		__m128 pmb = _mm_load_ps( parentMatrix + 4 );
		__m128 pmc = _mm_load_ps( parentMatrix + 8 );

		__m128 cma = _mm_load_ps( childMatrix + 0 );
		__m128 cmb = _mm_load_ps( childMatrix + 4 );
		__m128 cmc = _mm_load_ps( childMatrix + 8 );

		__m128 ta = _mm_splat_ps( pma, 0 );
		__m128 tb = _mm_splat_ps( pma, 1 );
		__m128 tc = _mm_splat_ps( pma, 2 );

		__m128 td = _mm_splat_ps( pmb, 0 );
		__m128 te = _mm_splat_ps( pmb, 1 );
		__m128 tf = _mm_splat_ps( pmb, 2 );

		__m128 tg = _mm_splat_ps( pmc, 0 );
		__m128 th = _mm_splat_ps( pmc, 1 );
		__m128 ti = _mm_splat_ps( pmc, 2 );

		cma = _mm_sub_ps( cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
		cmb = _mm_sub_ps( cmb, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
		cmc = _mm_sub_ps( cmc, _mm_and_ps( pmc, vector_float_mask_keep_last ) );

		pma = _mm_mul_ps( ta, cma );
		pmb = _mm_mul_ps( tb, cma );
		pmc = _mm_mul_ps( tc, cma );

		pma = _mm_madd_ps( td, cmb, pma );
		pmb = _mm_madd_ps( te, cmb, pmb );
		pmc = _mm_madd_ps( tf, cmb, pmc );

		pma = _mm_madd_ps( tg, cmc, pma );
		pmb = _mm_madd_ps( th, cmc, pmb );
		pmc = _mm_madd_ps( ti, cmc, pmc );

		_mm_store_ps( childMatrix + 0, pma );
		_mm_store_ps( childMatrix + 4, pmb );
		_mm_store_ps( childMatrix + 8, pmc );
	}
}