DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

Simd_SSE.cpp (38817B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 
     29 #pragma hdrstop
     30 #include "../precompiled.h"
     31 #include "Simd_Generic.h"
     32 #include "Simd_SSE.h"
     33 
     34 //===============================================================
     35 //                                                        M
     36 //  SSE implementation of idSIMDProcessor                MrE
     37 //                                                        E
     38 //===============================================================
     39 
     40 
     41 #include <xmmintrin.h>
     42 
// Pi as a single-precision literal, used below for the half-pi quadrant
// correction in the atan approximation.
// NOTE(review): defined unconditionally — if <math.h>'s (double) M_PI is
// already visible via precompiled.h this redefines it; presumably the
// original build relied on that not happening. Confirm before changing.
#define M_PI	3.14159265358979323846f
     44 
     45 /*
     46 ============
     47 idSIMD_SSE::GetName
     48 ============
     49 */
     50 const char * idSIMD_SSE::GetName() const {
     51 	return "MMX & SSE";
     52 }
     53 
     54 /*
     55 ============
     56 idSIMD_SSE::BlendJoints
     57 ============
     58 */
// Blends the indexed 'joints' toward 'blendJoints' by fraction 'lerp':
// translations are lerped linearly, rotations are slerped along the
// shortest arc. Four joints are processed per iteration with SSE (the
// quaternions transposed into SoA form), and the remaining 0-3 joints are
// finished with the equivalent scalar idMath code.
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {

	if ( lerp <= 0.0f ) {
		// zero weight: destination pose is unchanged
		return;
	} else if ( lerp >= 1.0f ) {
		// full weight: plain copy of the blend pose
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	const __m128 vlerp = { lerp, lerp, lerp, lerp };

	// constants for the vectorized math below:
	// - rsqrt_c0/c1 drive one Newton-Raphson refinement of _mm_rsqrt_ps
	// - tiny keeps 1 - cos^2 away from zero for nearly-identical quaternions
	// - half_pi is the quadrant correction used by the atan approximation
	const __m128 vector_float_one		= { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= {  -3.0f,  -3.0f,  -3.0f,  -3.0f };
	const __m128 vector_float_rsqrt_c1	= {  -0.5f,  -0.5f,  -0.5f,  -0.5f };
	const __m128 vector_float_tiny		= {    1e-10f,    1e-10f,    1e-10f,    1e-10f };
	const __m128 vector_float_half_pi	= { M_PI*0.5f, M_PI*0.5f, M_PI*0.5f, M_PI*0.5f };

	// odd-polynomial coefficients for sin(), matching the scalar
	// idMath::Sin16 used in the tail loop below
	const __m128 vector_float_sin_c0	= { -2.39e-08f, -2.39e-08f, -2.39e-08f, -2.39e-08f };
	const __m128 vector_float_sin_c1	= {  2.7526e-06f, 2.7526e-06f, 2.7526e-06f, 2.7526e-06f };
	const __m128 vector_float_sin_c2	= { -1.98409e-04f, -1.98409e-04f, -1.98409e-04f, -1.98409e-04f };
	const __m128 vector_float_sin_c3	= {  8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f };
	const __m128 vector_float_sin_c4	= { -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f };

	// odd-polynomial coefficients for atan(), matching the scalar
	// idMath::ATan16 used in the tail loop below
	const __m128 vector_float_atan_c0	= {  0.0028662257f,  0.0028662257f,  0.0028662257f,  0.0028662257f };
	const __m128 vector_float_atan_c1	= { -0.0161657367f, -0.0161657367f, -0.0161657367f, -0.0161657367f };
	const __m128 vector_float_atan_c2	= {  0.0429096138f,  0.0429096138f,  0.0429096138f,  0.0429096138f };
	const __m128 vector_float_atan_c3	= { -0.0752896400f, -0.0752896400f, -0.0752896400f, -0.0752896400f };
	const __m128 vector_float_atan_c4	= {  0.1065626393f,  0.1065626393f,  0.1065626393f,  0.1065626393f };
	const __m128 vector_float_atan_c5	= { -0.1420889944f, -0.1420889944f, -0.1420889944f, -0.1420889944f };
	const __m128 vector_float_atan_c6	= {  0.1999355085f,  0.1999355085f,  0.1999355085f,  0.1999355085f };
	const __m128 vector_float_atan_c7	= { -0.3333314528f, -0.3333314528f, -0.3333314528f, -0.3333314528f };

	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		// load four source and four blend joints (quaternion + translation)
		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		// lerp the translations: t = t + lerp * ( bt - t )
		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );

		// transpose the four quaternions from AoS to SoA (x, y, z, w lanes)
		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );

		// four-wide dot product: cosom = cos( omega ) for each pair
		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosomg_0 = _mm_add_ps( cosome_0, cosomf_0 );

		// take |cosom| and remember the sign so the blend follows the
		// shortest arc (the sign is re-applied to scaleb below)
		__m128 sign_0 = _mm_and_ps( cosomg_0, vector_float_sign_bit );
		__m128 cosom_0 = _mm_xor_ps( cosomg_0, sign_0 );
		// ss = 1 - cosom^2 = sin^2( omega )
		__m128 ss_0 = _mm_nmsub_ps( cosom_0, cosom_0, vector_float_one );

		// clamp so the rsqrt below is defined even for identical quaternions
		ss_0 = _mm_max_ps( ss_0, vector_float_tiny );

		// one Newton-Raphson refinement of the hardware rsqrt estimate:
		// sinom = -0.5 * rs * ( ss * rs * rs - 3 ) ~= 1 / sqrt( ss )
		__m128 rs_0 = _mm_rsqrt_ps( ss_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( ss_0, sq_0, vector_float_rsqrt_c0 );
		__m128 sinom_0 = _mm_mul_ps( sh_0, sx_0 );						// sinom = 1 / sqrt( ss )

		ss_0 = _mm_mul_ps( ss_0, sinom_0 );								// ss = sqrt( ss ) = sin( omega )

		// atan2( sin, cos ): fold the ratio into [0,1] by dividing the
		// smaller operand by the larger; when cos is the smaller one the
		// result is corrected with +pi/2 and a sign flip (maskPI/masksign)
		__m128 min_0 = _mm_min_ps( ss_0, cosom_0 );
		__m128 max_0 = _mm_max_ps( ss_0, cosom_0 );
		__m128 mask_0 = _mm_cmpeq_ps( min_0, cosom_0 );
		__m128 masksign_0 = _mm_and_ps( mask_0, vector_float_sign_bit );
		__m128 maskPI_0 = _mm_and_ps( mask_0, vector_float_half_pi );

		// Newton-Raphson refined reciprocal of the larger operand
		__m128 rcpa_0 = _mm_rcp_ps( max_0 );
		__m128 rcpb_0 = _mm_mul_ps( max_0, rcpa_0 );
		__m128 rcpd_0 = _mm_add_ps( rcpa_0, rcpa_0 );
		__m128 rcp_0 = _mm_nmsub_ps( rcpb_0, rcpa_0, rcpd_0 );			// 1 / y or 1 / x
		__m128 ata_0 = _mm_mul_ps( min_0, rcp_0 );						// x / y or y / x

		// odd-polynomial evaluation of atan over the folded range
		__m128 atb_0 = _mm_xor_ps( ata_0, masksign_0 );					// -x / y or y / x
		__m128 atc_0 = _mm_mul_ps( atb_0, atb_0 );
		__m128 atd_0 = _mm_madd_ps( atc_0, vector_float_atan_c0, vector_float_atan_c1 );

		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c2 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c3 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c4 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c5 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c6 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_atan_c7 );
		atd_0 = _mm_madd_ps( atd_0, atc_0, vector_float_one );

		// omega_a = ( 1 - lerp ) * omega, omega_b = lerp * omega
		__m128 omega_a_0 = _mm_madd_ps( atd_0, atb_0, maskPI_0 );
		__m128 omega_b_0 = _mm_mul_ps( vlerp, omega_a_0 );
		omega_a_0 = _mm_sub_ps( omega_a_0, omega_b_0 );

		// sin( omega_a ) and sin( omega_b ) via the Sin16 polynomial
		__m128 sinsa_0 = _mm_mul_ps( omega_a_0, omega_a_0 );
		__m128 sinsb_0 = _mm_mul_ps( omega_b_0, omega_b_0 );
		__m128 sina_0 = _mm_madd_ps( sinsa_0, vector_float_sin_c0, vector_float_sin_c1 );
		__m128 sinb_0 = _mm_madd_ps( sinsb_0, vector_float_sin_c0, vector_float_sin_c1 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c2 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c2 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c3 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c3 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_sin_c4 );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_sin_c4 );
		sina_0 = _mm_madd_ps( sina_0, sinsa_0, vector_float_one );
		sinb_0 = _mm_madd_ps( sinb_0, sinsb_0, vector_float_one );
		sina_0 = _mm_mul_ps( sina_0, omega_a_0 );
		sinb_0 = _mm_mul_ps( sinb_0, omega_b_0 );
		// slerp weights: scale = sin( w * omega ) / sin( omega )
		__m128 scalea_0 = _mm_mul_ps( sina_0, sinom_0 );
		__m128 scaleb_0 = _mm_mul_ps( sinb_0, sinom_0 );

		// restore the dot-product sign on the blend weight (shortest arc)
		scaleb_0 = _mm_xor_ps( scaleb_0, sign_0 );

		jqx_0 = _mm_mul_ps( jqx_0, scalea_0 );
		jqy_0 = _mm_mul_ps( jqy_0, scalea_0 );
		jqz_0 = _mm_mul_ps( jqz_0, scalea_0 );
		jqw_0 = _mm_mul_ps( jqw_0, scalea_0 );

		jqx_0 = _mm_madd_ps( bqx_0, scaleb_0, jqx_0 );
		jqy_0 = _mm_madd_ps( bqy_0, scaleb_0, jqy_0 );
		jqz_0 = _mm_madd_ps( bqz_0, scaleb_0, jqz_0 );
		jqw_0 = _mm_madd_ps( bqw_0, scaleb_0, jqw_0 );

		// transpose the blended quaternions back to AoS and store
		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}

	// scalar tail for the last 0-3 joints; mirrors the SIMD path using idMath
	for ( ; i < numJoints; i++ ) {
		int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
		// NOTE(review): zeroes the float following t; the SIMD path above
		// instead stores a lerped value into that lane — presumably the
		// field is padding and both values are ignored; confirm before
		// relying on joints[].w
		joints[n].w = 0.0f;

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float sinom;
		float omega;
		float scale0;
		float scale1;
		// NOTE(review): float bits are type-punned through unsigned long
		// below; this assumes a 32-bit long (true for the original MSVC
		// target) and bypasses strict aliasing — verify on LP64 targets
		unsigned long signBit;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		// extract and strip the sign for shortest-arc blending
		signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );

		(*(unsigned long *)&cosom) ^= signBit;

		scale0 = 1.0f - cosom * cosom;
		scale0 = ( scale0 <= 0.0f ) ? 1e-10f : scale0;		// clamp sin^2 like vector_float_tiny above
		sinom = idMath::InvSqrt( scale0 );
		omega = idMath::ATan16( scale0 * sinom, cosom );
		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
		scale1 = idMath::Sin16( lerp * omega ) * sinom;

		// re-apply the sign to the blend weight
		(*(unsigned long *)&scale1) ^= signBit;

		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
		jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
		jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
	}
}
    295 
    296 /*
    297 ============
    298 idSIMD_SSE::BlendJointsFast
    299 ============
    300 */
// Fast approximate blend: instead of a true slerp this linearly blends the
// quaternions and renormalizes (nlerp). With scaledLerp = lerp / ( 1 - lerp )
// the sum q + scaledLerp * b is proportional to ( 1 - lerp ) * q + lerp * b,
// and the final normalization removes the common factor. Translations are
// lerped exactly as in BlendJoints. Four joints per SSE iteration, scalar
// tail for the remainder.
void VPCALL idSIMD_SSE::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( blendJoints );
	assert_16_byte_aligned( JOINTQUAT_Q_OFFSET );
	assert_16_byte_aligned( JOINTQUAT_T_OFFSET );
	assert_sizeof_16_byte_multiple( idJointQuat );

	if ( lerp <= 0.0f ) {
		// zero weight: destination pose is unchanged
		return;
	} else if ( lerp >= 1.0f ) {
		// full weight: plain copy of the blend pose
		for ( int i = 0; i < numJoints; i++ ) {
			int j = index[i];
			joints[j] = blendJoints[j];
		}
		return;
	}

	// rsqrt_c0/c1 drive one Newton-Raphson refinement of _mm_rsqrt_ps
	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= {  -3.0f,  -3.0f,  -3.0f,  -3.0f };
	const __m128 vector_float_rsqrt_c1	= {  -0.5f,  -0.5f,  -0.5f,  -0.5f };

	// safe: the lerp >= 1.0f case returned above, so 1 - lerp != 0
	const float scaledLerp = lerp / ( 1.0f - lerp );
	const __m128 vlerp = { lerp, lerp, lerp, lerp };
	const __m128 vscaledLerp = { scaledLerp, scaledLerp, scaledLerp, scaledLerp };

	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const int n0 = index[i+0];
		const int n1 = index[i+1];
		const int n2 = index[i+2];
		const int n3 = index[i+3];

		// load four source and four blend joints (quaternion + translation)
		__m128 jqa_0 = _mm_load_ps( joints[n0].q.ToFloatPtr() );
		__m128 jqb_0 = _mm_load_ps( joints[n1].q.ToFloatPtr() );
		__m128 jqc_0 = _mm_load_ps( joints[n2].q.ToFloatPtr() );
		__m128 jqd_0 = _mm_load_ps( joints[n3].q.ToFloatPtr() );

		__m128 jta_0 = _mm_load_ps( joints[n0].t.ToFloatPtr() );
		__m128 jtb_0 = _mm_load_ps( joints[n1].t.ToFloatPtr() );
		__m128 jtc_0 = _mm_load_ps( joints[n2].t.ToFloatPtr() );
		__m128 jtd_0 = _mm_load_ps( joints[n3].t.ToFloatPtr() );

		__m128 bqa_0 = _mm_load_ps( blendJoints[n0].q.ToFloatPtr() );
		__m128 bqb_0 = _mm_load_ps( blendJoints[n1].q.ToFloatPtr() );
		__m128 bqc_0 = _mm_load_ps( blendJoints[n2].q.ToFloatPtr() );
		__m128 bqd_0 = _mm_load_ps( blendJoints[n3].q.ToFloatPtr() );

		__m128 bta_0 = _mm_load_ps( blendJoints[n0].t.ToFloatPtr() );
		__m128 btb_0 = _mm_load_ps( blendJoints[n1].t.ToFloatPtr() );
		__m128 btc_0 = _mm_load_ps( blendJoints[n2].t.ToFloatPtr() );
		__m128 btd_0 = _mm_load_ps( blendJoints[n3].t.ToFloatPtr() );

		// lerp the translations: t = t + lerp * ( bt - t )
		bta_0 = _mm_sub_ps( bta_0, jta_0 );
		btb_0 = _mm_sub_ps( btb_0, jtb_0 );
		btc_0 = _mm_sub_ps( btc_0, jtc_0 );
		btd_0 = _mm_sub_ps( btd_0, jtd_0 );

		jta_0 = _mm_madd_ps( vlerp, bta_0, jta_0 );
		jtb_0 = _mm_madd_ps( vlerp, btb_0, jtb_0 );
		jtc_0 = _mm_madd_ps( vlerp, btc_0, jtc_0 );
		jtd_0 = _mm_madd_ps( vlerp, btd_0, jtd_0 );

		_mm_store_ps( joints[n0].t.ToFloatPtr(), jta_0 );
		_mm_store_ps( joints[n1].t.ToFloatPtr(), jtb_0 );
		_mm_store_ps( joints[n2].t.ToFloatPtr(), jtc_0 );
		_mm_store_ps( joints[n3].t.ToFloatPtr(), jtd_0 );

		// transpose the four quaternions from AoS to SoA (x, y, z, w lanes)
		__m128 jqr_0 = _mm_unpacklo_ps( jqa_0, jqc_0 );
		__m128 jqs_0 = _mm_unpackhi_ps( jqa_0, jqc_0 );
		__m128 jqt_0 = _mm_unpacklo_ps( jqb_0, jqd_0 );
		__m128 jqu_0 = _mm_unpackhi_ps( jqb_0, jqd_0 );

		__m128 bqr_0 = _mm_unpacklo_ps( bqa_0, bqc_0 );
		__m128 bqs_0 = _mm_unpackhi_ps( bqa_0, bqc_0 );
		__m128 bqt_0 = _mm_unpacklo_ps( bqb_0, bqd_0 );
		__m128 bqu_0 = _mm_unpackhi_ps( bqb_0, bqd_0 );

		__m128 jqx_0 = _mm_unpacklo_ps( jqr_0, jqt_0 );
		__m128 jqy_0 = _mm_unpackhi_ps( jqr_0, jqt_0 );
		__m128 jqz_0 = _mm_unpacklo_ps( jqs_0, jqu_0 );
		__m128 jqw_0 = _mm_unpackhi_ps( jqs_0, jqu_0 );

		__m128 bqx_0 = _mm_unpacklo_ps( bqr_0, bqt_0 );
		__m128 bqy_0 = _mm_unpackhi_ps( bqr_0, bqt_0 );
		__m128 bqz_0 = _mm_unpacklo_ps( bqs_0, bqu_0 );
		__m128 bqw_0 = _mm_unpackhi_ps( bqs_0, bqu_0 );

		// four-wide dot product: cosom = cos( omega ) for each pair
		__m128 cosoma_0 = _mm_mul_ps( jqx_0, bqx_0 );
		__m128 cosomb_0 = _mm_mul_ps( jqy_0, bqy_0 );
		__m128 cosomc_0 = _mm_mul_ps( jqz_0, bqz_0 );
		__m128 cosomd_0 = _mm_mul_ps( jqw_0, bqw_0 );

		__m128 cosome_0 = _mm_add_ps( cosoma_0, cosomb_0 );
		__m128 cosomf_0 = _mm_add_ps( cosomc_0, cosomd_0 );
		__m128 cosom_0 = _mm_add_ps( cosome_0, cosomf_0 );

		// flip the blend weight's sign when the dot product is negative so
		// the blend follows the shortest arc
		__m128 sign_0 = _mm_and_ps( cosom_0, vector_float_sign_bit );

		__m128 scale_0 = _mm_xor_ps( vscaledLerp, sign_0 );

		// linear combine: q = q + scale * b
		jqx_0 = _mm_madd_ps( scale_0, bqx_0, jqx_0 );
		jqy_0 = _mm_madd_ps( scale_0, bqy_0, jqy_0 );
		jqz_0 = _mm_madd_ps( scale_0, bqz_0, jqz_0 );
		jqw_0 = _mm_madd_ps( scale_0, bqw_0, jqw_0 );

		// squared length of each blended quaternion
		__m128 da_0 = _mm_mul_ps( jqx_0, jqx_0 );
		__m128 db_0 = _mm_mul_ps( jqy_0, jqy_0 );
		__m128 dc_0 = _mm_mul_ps( jqz_0, jqz_0 );
		__m128 dd_0 = _mm_mul_ps( jqw_0, jqw_0 );

		__m128 de_0 = _mm_add_ps( da_0, db_0 );
		__m128 df_0 = _mm_add_ps( dc_0, dd_0 );
		__m128 d_0 = _mm_add_ps( de_0, df_0 );

		// Newton-Raphson refined rsqrt: s = -0.5 * rs * ( d * rs * rs - 3 )
		__m128 rs_0 = _mm_rsqrt_ps( d_0 );
		__m128 sq_0 = _mm_mul_ps( rs_0, rs_0 );
		__m128 sh_0 = _mm_mul_ps( rs_0, vector_float_rsqrt_c1 );
		__m128 sx_0 = _mm_madd_ps( d_0, sq_0, vector_float_rsqrt_c0 );
		__m128 s_0 = _mm_mul_ps( sh_0, sx_0 );

		// normalize
		jqx_0 = _mm_mul_ps( jqx_0, s_0 );
		jqy_0 = _mm_mul_ps( jqy_0, s_0 );
		jqz_0 = _mm_mul_ps( jqz_0, s_0 );
		jqw_0 = _mm_mul_ps( jqw_0, s_0 );

		// transpose back to AoS and store
		__m128 tp0_0 = _mm_unpacklo_ps( jqx_0, jqz_0 );
		__m128 tp1_0 = _mm_unpackhi_ps( jqx_0, jqz_0 );
		__m128 tp2_0 = _mm_unpacklo_ps( jqy_0, jqw_0 );
		__m128 tp3_0 = _mm_unpackhi_ps( jqy_0, jqw_0 );

		__m128 p0_0 = _mm_unpacklo_ps( tp0_0, tp2_0 );
		__m128 p1_0 = _mm_unpackhi_ps( tp0_0, tp2_0 );
		__m128 p2_0 = _mm_unpacklo_ps( tp1_0, tp3_0 );
		__m128 p3_0 = _mm_unpackhi_ps( tp1_0, tp3_0 );

		_mm_store_ps( joints[n0].q.ToFloatPtr(), p0_0 );
		_mm_store_ps( joints[n1].q.ToFloatPtr(), p1_0 );
		_mm_store_ps( joints[n2].q.ToFloatPtr(), p2_0 );
		_mm_store_ps( joints[n3].q.ToFloatPtr(), p3_0 );
	}

	// scalar tail for the last 0-3 joints; mirrors the SIMD path
	for ( ; i < numJoints; i++ ) {
		const int n = index[i];

		idVec3 &jointVert = joints[n].t;
		const idVec3 &blendVert = blendJoints[n].t;

		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );

		idQuat &jointQuat = joints[n].q;
		const idQuat &blendQuat = blendJoints[n].q;

		float cosom;
		float scale;
		float s;

		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;

		// float-select on the sign of cosom: +scaledLerp when cosom >= 0,
		// -scaledLerp otherwise (shortest-arc correction); __fsels is a
		// project helper — presumably branch-free, see the SIMD xor above
		scale = __fsels( cosom, scaledLerp, -scaledLerp );

		jointQuat.x += scale * blendQuat.x;
		jointQuat.y += scale * blendQuat.y;
		jointQuat.z += scale * blendQuat.z;
		jointQuat.w += scale * blendQuat.w;

		// renormalize; __frsqrts is the project's reciprocal-sqrt helper
		s = jointQuat.x * jointQuat.x + jointQuat.y * jointQuat.y + jointQuat.z * jointQuat.z + jointQuat.w * jointQuat.w;
		s = __frsqrts( s );

		jointQuat.x *= s;
		jointQuat.y *= s;
		jointQuat.z *= s;
		jointQuat.w *= s;
	}
}
    477 
    478 /*
    479 ============
    480 idSIMD_SSE::ConvertJointQuatsToJointMats
    481 ============
    482 */
// Converts an array of idJointQuat (unit quaternion + translation, 8 floats
// each) into idJointMat 3x4 matrices (12 floats each) using the standard
// quat-to-matrix expansion (terms like 1 - 2yy - 2zz, 2xy - 2wz, ...).
// Two joints are converted per iteration; an odd final joint is handled by
// the tail loop with identical math.
// _mm_perm_ps is a project wrapper — presumably a single-source shuffle.
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	// layout check: t must immediately follow q inside idJointQuat
	// NOTE(review): the pointer-to-int casts truncate on 64-bit targets;
	// harmless for these small offsets but worth confirming
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

	const float * jointQuatPtr = (float *)jointQuats;
	float * jointMatPtr = (float *)jointMats;

	// sign masks and +/-0.5 seeds used to build the diagonal 0.5 - ... terms;
	// the quat2mat_mad vectors carry the +/- pattern of the off-diagonal
	// 2xy +/- 2wz style terms (see the per-line comments below)
	const __m128 vector_float_first_sign_bit		= __m128c( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x80000000 ) );
	const __m128 vector_float_last_three_sign_bits	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x00000000 ) );
	const __m128 vector_float_first_pos_half		= {   0.5f,   0.0f,   0.0f,   0.0f };	// +.5 0 0 0
	const __m128 vector_float_first_neg_half		= {  -0.5f,   0.0f,   0.0f,   0.0f };	// -.5 0 0 0
	const __m128 vector_float_quat2mat_mad1			= {  -1.0f,  -1.0f,  +1.0f,  -1.0f };	//  - - + -
	const __m128 vector_float_quat2mat_mad2			= {  -1.0f,  +1.0f,  -1.0f,  -1.0f };	//  - + - -
	const __m128 vector_float_quat2mat_mad3			= {  +1.0f,  -1.0f,  -1.0f,  +1.0f };	//  + - - +

	int i = 0;
	// main loop: two joints per iteration
	for ( ; i + 1 < numJoints; i += 2 ) {

		// quaternion (x y z w) and translation (x y z + pad) of both joints
		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 q1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+0] );

		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );
		__m128 t1 = _mm_load_ps( &jointQuatPtr[i*8+1*8+4] );

		// d = 2q, the doubled components used throughout the expansion
		__m128 d0 = _mm_add_ps( q0, q0 );
		__m128 d1 = _mm_add_ps( q1, q1 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );							//   y,   x,   x,   y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );							//  y2,  y2,  z2,  z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );							//   z,   w,   w,   w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );							//  z2,  z2,  y2,  x2
		__m128 sa1 = _mm_perm_ps( q1, _MM_SHUFFLE( 1, 0, 0, 1 ) );							//   y,   x,   x,   y
		__m128 sb1 = _mm_perm_ps( d1, _MM_SHUFFLE( 2, 2, 1, 1 ) );							//  y2,  y2,  z2,  z2
		__m128 sc1 = _mm_perm_ps( q1, _MM_SHUFFLE( 3, 3, 3, 2 ) );							//   z,   w,   w,   w
		__m128 sd1 = _mm_perm_ps( d1, _MM_SHUFFLE( 0, 1, 2, 2 ) );							//  z2,  z2,  y2,  x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sa1 = _mm_xor_ps( sa1, vector_float_first_sign_bit );

		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );							// flip stupid inverse quaternions
		sc1 = _mm_xor_ps( sc1, vector_float_last_three_sign_bits );							// flip stupid inverse quaternions

		// product terms; the result is scaled by 0.5 here and doubled by the
		// madd combinations below (the trailing comments track each lane)
		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );		//  .5 - yy2,  xy2,  xz2,  yz2		//  .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );		// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		//  .5 0 0 0
		__m128 ma1 = _mm_add_ps( _mm_mul_ps( sa1, sb1 ), vector_float_first_pos_half );		//  .5 - yy2,  xy2,  xz2,  yz2		//  .5 0 0 0
		__m128 mb1 = _mm_add_ps( _mm_mul_ps( sc1, sd1 ), vector_float_first_neg_half );		// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc1 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q1, d1 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		//  .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );					//       xy2,  xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );					//  .5 - xx2,  xy2,  xz2,  yz2			// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );					//  .5 - yy2,  xy2,  wy2,  wx2			// 00, 01, 12, 13
		__m128 mf1 = _mm_shuffle_ps( ma1, mc1, _MM_SHUFFLE( 0, 0, 1, 1 ) );					//       xy2,  xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md1 = _mm_shuffle_ps( mf1, ma1, _MM_SHUFFLE( 3, 2, 0, 2 ) );					//  .5 - xx2,  xy2,  xz2,  yz2			// 10, 01, 02, 03
		__m128 me1 = _mm_shuffle_ps( ma1, mb1, _MM_SHUFFLE( 3, 2, 1, 0 ) );					//  .5 - yy2,  xy2,  wy2,  wx2			// 00, 01, 12, 13

		// combine into the three rotation rows (signs from the mad vectors)
		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );		// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,					// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );		// 1 - xx2 - zz2, xy2 + wz2,          , yz2 - wx2		// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );		// 1 - xx2 - yy2,          , xz2 - wy2, yz2 + wx2		// + - - +
		__m128 ra1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad1 ), ma1 );		// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,					// - - + -
		__m128 rb1 = _mm_add_ps( _mm_mul_ps( mb1, vector_float_quat2mat_mad2 ), md1 );		// 1 - xx2 - zz2, xy2 + wz2,          , yz2 - wx2		// - + - -
		__m128 rc1 = _mm_add_ps( _mm_mul_ps( me1, vector_float_quat2mat_mad3 ), md1 );		// 1 - xx2 - yy2,          , xz2 - wy2, yz2 + wx2		// + - - +

		// splice the translation component into the fourth column of each row
		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );
		__m128 ta1 = _mm_shuffle_ps( ra1, t1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb1 = _mm_shuffle_ps( rb1, t1, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc1 = _mm_shuffle_ps( rc1, t1, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );						// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );						// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );						// 02 03 00 12
		ra1 = _mm_shuffle_ps( ra1, ta1, _MM_SHUFFLE( 2, 0, 1, 0 ) );						// 00 01 02 10
		rb1 = _mm_shuffle_ps( rb1, tb1, _MM_SHUFFLE( 2, 0, 0, 1 ) );						// 01 00 03 11
		rc1 = _mm_shuffle_ps( rc1, tc1, _MM_SHUFFLE( 2, 0, 3, 2 ) );						// 02 03 00 12

		// store the two 3x4 matrices (three rows of four floats each)
		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+0], ra1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+4], rb1 );
		_mm_store_ps( &jointMatPtr[i*12+1*12+8], rc1 );
	}

	// tail: convert the final joint when numJoints is odd (same math as above)
	for ( ; i < numJoints; i++ ) {

		__m128 q0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+0] );
		__m128 t0 = _mm_load_ps( &jointQuatPtr[i*8+0*8+4] );

		__m128 d0 = _mm_add_ps( q0, q0 );

		__m128 sa0 = _mm_perm_ps( q0, _MM_SHUFFLE( 1, 0, 0, 1 ) );							//   y,   x,   x,   y
		__m128 sb0 = _mm_perm_ps( d0, _MM_SHUFFLE( 2, 2, 1, 1 ) );							//  y2,  y2,  z2,  z2
		__m128 sc0 = _mm_perm_ps( q0, _MM_SHUFFLE( 3, 3, 3, 2 ) );							//   z,   w,   w,   w
		__m128 sd0 = _mm_perm_ps( d0, _MM_SHUFFLE( 0, 1, 2, 2 ) );							//  z2,  z2,  y2,  x2

		sa0 = _mm_xor_ps( sa0, vector_float_first_sign_bit );
		sc0 = _mm_xor_ps( sc0, vector_float_last_three_sign_bits );							// flip stupid inverse quaternions

		__m128 ma0 = _mm_add_ps( _mm_mul_ps( sa0, sb0 ), vector_float_first_pos_half );		//  .5 - yy2,  xy2,  xz2,  yz2		//  .5 0 0 0
		__m128 mb0 = _mm_add_ps( _mm_mul_ps( sc0, sd0 ), vector_float_first_neg_half );		// -.5 + zz2,  wz2,  wy2,  wx2		// -.5 0 0 0
		__m128 mc0 = _mm_sub_ps( vector_float_first_pos_half, _mm_mul_ps( q0, d0 ) );		//  .5 - xx2, -yy2, -zz2, -ww2		//  .5 0 0 0

		__m128 mf0 = _mm_shuffle_ps( ma0, mc0, _MM_SHUFFLE( 0, 0, 1, 1 ) );					//       xy2,  xy2, .5 - xx2, .5 - xx2	// 01, 01, 10, 10
		__m128 md0 = _mm_shuffle_ps( mf0, ma0, _MM_SHUFFLE( 3, 2, 0, 2 ) );					//  .5 - xx2,  xy2,  xz2,  yz2			// 10, 01, 02, 03
		__m128 me0 = _mm_shuffle_ps( ma0, mb0, _MM_SHUFFLE( 3, 2, 1, 0 ) );					//  .5 - yy2,  xy2,  wy2,  wx2			// 00, 01, 12, 13

		__m128 ra0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad1 ), ma0 );		// 1 - yy2 - zz2, xy2 - wz2, xz2 + wy2,					// - - + -
		__m128 rb0 = _mm_add_ps( _mm_mul_ps( mb0, vector_float_quat2mat_mad2 ), md0 );		// 1 - xx2 - zz2, xy2 + wz2,          , yz2 - wx2		// - + - -
		__m128 rc0 = _mm_add_ps( _mm_mul_ps( me0, vector_float_quat2mat_mad3 ), md0 );		// 1 - xx2 - yy2,          , xz2 - wy2, yz2 + wx2		// + - - +

		// splice in the translation and store the 3x4 matrix
		__m128 ta0 = _mm_shuffle_ps( ra0, t0, _MM_SHUFFLE( 0, 0, 2, 2 ) );
		__m128 tb0 = _mm_shuffle_ps( rb0, t0, _MM_SHUFFLE( 1, 1, 3, 3 ) );
		__m128 tc0 = _mm_shuffle_ps( rc0, t0, _MM_SHUFFLE( 2, 2, 0, 0 ) );

		ra0 = _mm_shuffle_ps( ra0, ta0, _MM_SHUFFLE( 2, 0, 1, 0 ) );						// 00 01 02 10
		rb0 = _mm_shuffle_ps( rb0, tb0, _MM_SHUFFLE( 2, 0, 0, 1 ) );						// 01 00 03 11
		rc0 = _mm_shuffle_ps( rc0, tc0, _MM_SHUFFLE( 2, 0, 3, 2 ) );						// 02 03 00 12

		_mm_store_ps( &jointMatPtr[i*12+0*12+0], ra0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+4], rb0 );
		_mm_store_ps( &jointMatPtr[i*12+0*12+8], rc0 );
	}
}
    609 
    610 /*
    611 ============
    612 idSIMD_SSE::ConvertJointMatsToJointQuats
    613 ============
    614 */
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
	// Converts 3x4 joint matrices to quaternion + translation pairs.
	// The main loop handles four joints per iteration, branch-free: the
	// matrix elements are transposed into SoA form so the four standard
	// matrix-to-quaternion cases (trace positive / m00 largest / m11 > m22 /
	// otherwise) can be selected with lane masks instead of branches.

	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	// q and t must be laid out back to back: the stores below write each
	// joint's quaternion and translation as one contiguous 8-float block.
	// NOTE(review): the (int) pointer casts are tiny offsets from null, so
	// truncation on 64-bit builds is benign in practice, but offsetof()
	// would be the portable spelling — confirm against the build warnings.
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

	const __m128 vector_float_zero		= _mm_setzero_ps();
	const __m128 vector_float_one		= { 1.0f, 1.0f, 1.0f, 1.0f };
	const __m128 vector_float_not		= __m128c( _mm_set_epi32( -1, -1, -1, -1 ) );
	const __m128 vector_float_sign_bit	= __m128c( _mm_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ) );
	const __m128 vector_float_rsqrt_c0	= {  -3.0f,  -3.0f,  -3.0f,  -3.0f };
	const __m128 vector_float_rsqrt_c2	= { -0.25f, -0.25f, -0.25f, -0.25f };

	int i = 0;
	for ( ; i < numJoints - 3; i += 4 ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		// Load four 3x4 matrices (joints a, b, c, d), one row per register.
		__m128 ma0 = _mm_load_ps( &m[0*12+0] );
		__m128 ma1 = _mm_load_ps( &m[0*12+4] );
		__m128 ma2 = _mm_load_ps( &m[0*12+8] );

		__m128 mb0 = _mm_load_ps( &m[1*12+0] );
		__m128 mb1 = _mm_load_ps( &m[1*12+4] );
		__m128 mb2 = _mm_load_ps( &m[1*12+8] );

		__m128 mc0 = _mm_load_ps( &m[2*12+0] );
		__m128 mc1 = _mm_load_ps( &m[2*12+4] );
		__m128 mc2 = _mm_load_ps( &m[2*12+8] );

		__m128 md0 = _mm_load_ps( &m[3*12+0] );
		__m128 md1 = _mm_load_ps( &m[3*12+4] );
		__m128 md2 = _mm_load_ps( &m[3*12+8] );

		// Transpose to SoA: after the two unpack passes below, mRC holds
		// matrix element [R][C] of all four joints (lanes = a, b, c, d).
		__m128 ta0 = _mm_unpacklo_ps( ma0, mc0 );	// a0, c0, a1, c1
		__m128 ta1 = _mm_unpackhi_ps( ma0, mc0 );	// a2, c2, a3, c3
		__m128 ta2 = _mm_unpacklo_ps( mb0, md0 );	// b0, d0, b1, d1
		__m128 ta3 = _mm_unpackhi_ps( mb0, md0 );	// b2, d2, b3, d3

		__m128 tb0 = _mm_unpacklo_ps( ma1, mc1 );	// a0, c0, a1, c1
		__m128 tb1 = _mm_unpackhi_ps( ma1, mc1 );	// a2, c2, a3, c3
		__m128 tb2 = _mm_unpacklo_ps( mb1, md1 );	// b0, d0, b1, d1
		__m128 tb3 = _mm_unpackhi_ps( mb1, md1 );	// b2, d2, b3, d3

		__m128 tc0 = _mm_unpacklo_ps( ma2, mc2 );	// a0, c0, a1, c1
		__m128 tc1 = _mm_unpackhi_ps( ma2, mc2 );	// a2, c2, a3, c3
		__m128 tc2 = _mm_unpacklo_ps( mb2, md2 );	// b0, d0, b1, d1
		__m128 tc3 = _mm_unpackhi_ps( mb2, md2 );	// b2, d2, b3, d3

		__m128 m00 = _mm_unpacklo_ps( ta0, ta2 );
		__m128 m01 = _mm_unpackhi_ps( ta0, ta2 );
		__m128 m02 = _mm_unpacklo_ps( ta1, ta3 );
		__m128 m03 = _mm_unpackhi_ps( ta1, ta3 );

		__m128 m10 = _mm_unpacklo_ps( tb0, tb2 );
		__m128 m11 = _mm_unpackhi_ps( tb0, tb2 );
		__m128 m12 = _mm_unpacklo_ps( tb1, tb3 );
		__m128 m13 = _mm_unpackhi_ps( tb1, tb3 );

		__m128 m20 = _mm_unpacklo_ps( tc0, tc2 );
		__m128 m21 = _mm_unpackhi_ps( tc0, tc2 );
		__m128 m22 = _mm_unpacklo_ps( tc1, tc3 );
		__m128 m23 = _mm_unpackhi_ps( tc1, tc3 );

		// Case selection, branch-free:
		//   b0 = trace (m00 + m11 + m22) > 0
		//   b1 = m00 is the largest diagonal element
		//   b2 = m11 > m22
		__m128 b00 = _mm_add_ps( m00, m11 );
		__m128 b11 = _mm_cmpgt_ps( m00, m22 );
		__m128 b01 = _mm_add_ps( b00, m22 );
		__m128 b10 = _mm_cmpgt_ps( m00, m11 );
		__m128 b0  = _mm_cmpgt_ps( b01, vector_float_zero );
		__m128 b1  = _mm_and_ps( b10, b11 );
		__m128 b2  = _mm_cmpgt_ps( m11, m22 );

		// m0..m3 are mutually exclusive masks: first true of b0, b1, b2,
		// or none (i.e. which quaternion component is largest).
		__m128 m0  = b0;
		__m128 m1  = _mm_and_ps( _mm_xor_ps( b0, vector_float_not ), b1 );
		__m128 p1  = _mm_or_ps( b0, b1 );
		__m128 p2  = _mm_or_ps( p1, b2 );
		__m128 m2  = _mm_and_ps( _mm_xor_ps( p1, vector_float_not ), b2 );
		__m128 m3  = _mm_xor_ps( p2, vector_float_not );

		// Per-case sign flips, folded into the matrix elements so a single
		// shared formula below produces the correct component magnitudes.
		__m128 i0  = _mm_or_ps( m2, m3 );
		__m128 i1  = _mm_or_ps( m1, m3 );
		__m128 i2  = _mm_or_ps( m1, m2 );

		__m128 s0  = _mm_and_ps( i0, vector_float_sign_bit );
		__m128 s1  = _mm_and_ps( i1, vector_float_sign_bit );
		__m128 s2  = _mm_and_ps( i2, vector_float_sign_bit );

		m00 = _mm_xor_ps( m00, s0 );
		m11 = _mm_xor_ps( m11, s1 );
		m22 = _mm_xor_ps( m22, s2 );
		m21 = _mm_xor_ps( m21, s0 );
		m02 = _mm_xor_ps( m02, s1 );
		m10 = _mm_xor_ps( m10, s2 );

		// q0 = (+/-)m00 + (+/-)m11 + (+/-)m22 + 1 (signs folded in above);
		// q1..q3 are the off-diagonal differences for the other components.
		__m128 t0  = _mm_add_ps( m00, m11 );
		__m128 t1  = _mm_add_ps( m22, vector_float_one );
		__m128 q0  = _mm_add_ps( t0, t1 );
		__m128 q1  = _mm_sub_ps( m01, m10 );
		__m128 q2  = _mm_sub_ps( m20, m02 );
		__m128 q3  = _mm_sub_ps( m12, m21 );

		// Refine the rsqrt estimate with one Newton-Raphson step, with the
		// final *0.5 quaternion scale folded into the constants:
		// s = (3 - q0*rs^2) * rs * 0.25 ~= 0.5 / sqrt(q0)
		__m128 rs = _mm_rsqrt_ps( q0 );
		__m128 sq = _mm_mul_ps( rs, rs );
		__m128 sh = _mm_mul_ps( rs, vector_float_rsqrt_c2 );
		__m128 sx = _mm_madd_ps( q0, sq, vector_float_rsqrt_c0 );
		__m128 s  = _mm_mul_ps( sh, sx );

		q0 = _mm_mul_ps( q0, s );
		q1 = _mm_mul_ps( q1, s );
		q2 = _mm_mul_ps( q2, s );
		q3 = _mm_mul_ps( q3, s );

		// Undo the case-dependent component permutation (same reordering as
		// the "reverse" / "rotate 2" swaps in the scalar tail below).
		m0 = _mm_or_ps( m0, m2 );
		m2 = _mm_or_ps( m2, m3 );

		__m128 fq0 = _mm_sel_ps( q0, q3, m0 );
		__m128 fq1 = _mm_sel_ps( q1, q2, m0 );
		__m128 fq2 = _mm_sel_ps( q2, q1, m0 );
		__m128 fq3 = _mm_sel_ps( q3, q0, m0 );

		__m128 rq0 = _mm_sel_ps( fq0, fq2, m2 );
		__m128 rq1 = _mm_sel_ps( fq1, fq3, m2 );
		__m128 rq2 = _mm_sel_ps( fq2, fq0, m2 );
		__m128 rq3 = _mm_sel_ps( fq3, fq1, m2 );

		// Transpose the four quaternions back to AoS form.
		__m128 tq0 = _mm_unpacklo_ps( rq0, rq2 );
		__m128 tq1 = _mm_unpackhi_ps( rq0, rq2 );
		__m128 tq2 = _mm_unpacklo_ps( rq1, rq3 );
		__m128 tq3 = _mm_unpackhi_ps( rq1, rq3 );

		__m128 sq0 = _mm_unpacklo_ps( tq0, tq2 );
		__m128 sq1 = _mm_unpackhi_ps( tq0, tq2 );
		__m128 sq2 = _mm_unpacklo_ps( tq1, tq3 );
		__m128 sq3 = _mm_unpackhi_ps( tq1, tq3 );

		// Gather the translations (m03, m13, m23) with the 4th lane zeroed.
		__m128 tt0 = _mm_unpacklo_ps( m03, m23 );
		__m128 tt1 = _mm_unpackhi_ps( m03, m23 );
		__m128 tt2 = _mm_unpacklo_ps( m13, vector_float_zero );
		__m128 tt3 = _mm_unpackhi_ps( m13, vector_float_zero );

		__m128 st0 = _mm_unpacklo_ps( tt0, tt2 );
		__m128 st1 = _mm_unpackhi_ps( tt0, tt2 );
		__m128 st2 = _mm_unpacklo_ps( tt1, tt3 );
		__m128 st3 = _mm_unpackhi_ps( tt1, tt3 );

		// Interleaved stores: quaternion then translation for each joint
		// (relies on the q/t adjacency asserted above).
		_mm_store_ps( &q[0*4], sq0 );
		_mm_store_ps( &q[1*4], st0 );
		_mm_store_ps( &q[2*4], sq1 );
		_mm_store_ps( &q[3*4], st1 );
		_mm_store_ps( &q[4*4], sq2 );
		_mm_store_ps( &q[5*4], st2 );
		_mm_store_ps( &q[6*4], sq3 );
		_mm_store_ps( &q[7*4], st3 );
	}

	// Branch-free sign selection for the scalar tail: index 0 -> +1, 1 -> -1.
	float sign[2] = { 1.0f, -1.0f };

	// Scalar equivalent for the remaining 1-3 joints.
	for ( ; i < numJoints; i++ ) {
		const float *__restrict m = (float *)&jointMats[i];
		float *__restrict q = (float *)&jointQuats[i];

		// Same case selectors as the vector path.
		int b0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f;
		int b1 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2];
		int b2 = m[1 * 4 + 1] > m[2 * 4 + 2];

		int m0 = b0;
		int m1 = ( !b0 ) & b1;
		int m2 = ( !( b0 | b1 ) ) & b2;
		int m3 = !( b0 | b1 | b2 );

		int i0 = ( m2 | m3 );
		int i1 = ( m1 | m3 );
		int i2 = ( m1 | m2 );

		float s0 = sign[i0];
		float s1 = sign[i1];
		float s2 = sign[i2];

		// One Newton-Raphson step on the hardware rsqrt estimate, with the
		// final *0.5 scale folded in: s ~= 0.5 / sqrt(t).
		float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
		float s = __frsqrts( t );
		s = ( t * s * s + -3.0f ) * ( s * -0.25f );

		q[0] = t * s;
		q[1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
		q[2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
		q[3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		// Put the components into the right slots for the selected case.
		if ( m0 | m2 ) {
			// reverse
			SwapValues( q[0], q[3] );
			SwapValues( q[1], q[2] );
		}
		if ( m2 | m3 ) {
			// rotate 2
			SwapValues( q[0], q[2] );
			SwapValues( q[1], q[3] );
		}

		// Translation comes from the 4th column; w is unused and zeroed.
		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
		q[7] = 0.0f;
	}
}
    819 
    820 /*
    821 ============
    822 idSIMD_SSE::TransformJoints
    823 ============
    824 */
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// Concatenates each joint matrix with its parent's matrix, in place:
	// jointMats[joint] = jointMats[parents[joint]] * jointMats[joint]
	// for all joints in [firstJoint, lastJoint].
	// Mask that keeps only the 4th (translation) element of a matrix row.
	const __m128 vector_float_mask_keep_last	= __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );

	// Preload the matrix of joint (firstJoint - 1): 3*(firstJoint - 1)*4
	// floats into the array. Parents usually immediately precede their
	// children, so the product just computed for a parent can stay live in
	// registers (pma/pmb/pmc) across iterations instead of being reloaded.
	// NOTE(review): this load is before the start of the array when
	// firstJoint == 0 — confirm callers always pass firstJoint >= 1 (or
	// that the stray load is tolerated and its values never used).
	const float *__restrict firstMatrix = jointMats->ToFloatPtr() + ( firstJoint + firstJoint + firstJoint - 3 ) * 4;

	__m128 pma = _mm_load_ps( firstMatrix + 0 );
	__m128 pmb = _mm_load_ps( firstMatrix + 4 );
	__m128 pmc = _mm_load_ps( firstMatrix + 8 );

	for ( int joint = firstJoint; joint <= lastJoint; joint++ ) {
		const int parent = parents[joint];
		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;

		// Only reload the parent rows when the parent is not the joint that
		// was just processed (whose result is still in pma/pmb/pmc).
		if ( parent != joint - 1 ) {
			pma = _mm_load_ps( parentMatrix + 0 );
			pmb = _mm_load_ps( parentMatrix + 4 );
			pmc = _mm_load_ps( parentMatrix + 8 );
		}

		__m128 cma = _mm_load_ps( childMatrix + 0 );
		__m128 cmb = _mm_load_ps( childMatrix + 4 );
		__m128 cmc = _mm_load_ps( childMatrix + 8 );

		// Broadcast the parent rotation elements: t{a,b,c} = column 0 of the
		// three rows, t{d,e,f} = column 1, t{g,h,i} = column 2.
		__m128 ta = _mm_splat_ps( pma, 0 );
		__m128 tb = _mm_splat_ps( pmb, 0 );
		__m128 tc = _mm_splat_ps( pmc, 0 );

		__m128 td = _mm_splat_ps( pma, 1 );
		__m128 te = _mm_splat_ps( pmb, 1 );
		__m128 tf = _mm_splat_ps( pmc, 1 );

		__m128 tg = _mm_splat_ps( pma, 2 );
		__m128 th = _mm_splat_ps( pmb, 2 );
		__m128 ti = _mm_splat_ps( pmc, 2 );

		// row_r = parent[r][0]*child_row0 + parent[r][1]*child_row1
		//       + parent[r][2]*child_row2 + (0, 0, 0, parent[r][3]):
		// full 3x4 * 3x4 concatenation with an implicit (0,0,0,1) bottom row.
		pma = _mm_madd_ps( ta, cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
		pmb = _mm_madd_ps( tb, cma, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
		pmc = _mm_madd_ps( tc, cma, _mm_and_ps( pmc, vector_float_mask_keep_last ) );

		pma = _mm_madd_ps( td, cmb, pma );
		pmb = _mm_madd_ps( te, cmb, pmb );
		pmc = _mm_madd_ps( tf, cmb, pmc );

		pma = _mm_madd_ps( tg, cmc, pma );
		pmb = _mm_madd_ps( th, cmc, pmb );
		pmc = _mm_madd_ps( ti, cmc, pmc );

		// Store the result; it doubles as the parent for the next iteration
		// when the hierarchy is laid out sequentially.
		_mm_store_ps( childMatrix + 0, pma );
		_mm_store_ps( childMatrix + 4, pmb );
		_mm_store_ps( childMatrix + 8, pmc );
	}
}
    878 
    879 /*
    880 ============
    881 idSIMD_SSE::UntransformJoints
    882 ============
    883 */
    884 void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
    885 	const __m128 vector_float_mask_keep_last	= __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
    886 
    887 	for ( int joint = lastJoint; joint >= firstJoint; joint-- ) {
    888 		assert( parents[joint] < joint );
    889 		const int parent = parents[joint];
    890 		const float *__restrict parentMatrix = jointMats->ToFloatPtr() + ( parent + parent + parent ) * 4;
    891 		float *__restrict childMatrix = jointMats->ToFloatPtr() + ( joint + joint + joint ) * 4;
    892 
    893 		__m128 pma = _mm_load_ps( parentMatrix + 0 );
    894 		__m128 pmb = _mm_load_ps( parentMatrix + 4 );
    895 		__m128 pmc = _mm_load_ps( parentMatrix + 8 );
    896 
    897 		__m128 cma = _mm_load_ps( childMatrix + 0 );
    898 		__m128 cmb = _mm_load_ps( childMatrix + 4 );
    899 		__m128 cmc = _mm_load_ps( childMatrix + 8 );
    900 
    901 		__m128 ta = _mm_splat_ps( pma, 0 );
    902 		__m128 tb = _mm_splat_ps( pma, 1 );
    903 		__m128 tc = _mm_splat_ps( pma, 2 );
    904 
    905 		__m128 td = _mm_splat_ps( pmb, 0 );
    906 		__m128 te = _mm_splat_ps( pmb, 1 );
    907 		__m128 tf = _mm_splat_ps( pmb, 2 );
    908 
    909 		__m128 tg = _mm_splat_ps( pmc, 0 );
    910 		__m128 th = _mm_splat_ps( pmc, 1 );
    911 		__m128 ti = _mm_splat_ps( pmc, 2 );
    912 
    913 		cma = _mm_sub_ps( cma, _mm_and_ps( pma, vector_float_mask_keep_last ) );
    914 		cmb = _mm_sub_ps( cmb, _mm_and_ps( pmb, vector_float_mask_keep_last ) );
    915 		cmc = _mm_sub_ps( cmc, _mm_and_ps( pmc, vector_float_mask_keep_last ) );
    916 
    917 		pma = _mm_mul_ps( ta, cma );
    918 		pmb = _mm_mul_ps( tb, cma );
    919 		pmc = _mm_mul_ps( tc, cma );
    920 
    921 		pma = _mm_madd_ps( td, cmb, pma );
    922 		pmb = _mm_madd_ps( te, cmb, pmb );
    923 		pmc = _mm_madd_ps( tf, cmb, pmc );
    924 
    925 		pma = _mm_madd_ps( tg, cmc, pma );
    926 		pmb = _mm_madd_ps( th, cmc, pmb );
    927 		pmc = _mm_madd_ps( ti, cmc, pmc );
    928 
    929 		_mm_store_ps( childMatrix + 0, pma );
    930 		_mm_store_ps( childMatrix + 4, pmb );
    931 		_mm_store_ps( childMatrix + 8, pmc );
    932 	}
    933 }
    934