DrawVert_intrinsics.h (9874B)
1 /* 2 =========================================================================== 3 4 Doom 3 BFG Edition GPL Source Code 5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 6 7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 8 9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>. 21 22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. 23 24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. 25 26 =========================================================================== 27 */ 28 29 #ifndef __DRAWVERT_INTRINSICS_H__ 30 #define __DRAWVERT_INTRINSICS_H__ 31 32 #ifdef ID_WIN_X86_SSE2_INTRIN 33 34 static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT ); 35 static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS ); 36 static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 ); 37 static const __m128i vector_int_f16_min_exponent = _mm_set1_epi32( 0 ); 38 static const __m128i vector_int_f16_max_exponent = _mm_set1_epi32( ( 30 << IEEE_FLT16_MANTISSA_BITS ) ); 39 static const __m128i vector_int_f16_min_mantissa = _mm_set1_epi32( 0 ); 40 static const __m128i vector_int_f16_max_mantissa = _mm_set1_epi32( ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ) ); 41 static const __m128i vector_int_f32_to_f16_exponent_bias = _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS ); 42 static const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT; 43 static const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS; 44 static const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS; 45 46 static const __m128i vector_int_zero = _mm_setzero_si128(); 47 static const __m128i vector_int_one = _mm_set_epi32( 1, 1, 1, 1 ); 48 49 static const __m128 vector_float_mask_clear_last = __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) ); 50 static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f }; 51 static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }; 52 static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f }; 53 54 #endif 55 56 /* 57 ==================== 58 FastF32toF16 59 ==================== 60 */ 61 #ifdef ID_WIN_X86_SSE2_INTRIN 62 63 ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) { 64 __m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift ); 65 __m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift ); 66 __m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift ); 67 68 f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias ); 69 70 const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent ); 71 const __m128i overflow = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent ); 72 73 f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow ); 74 f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow ); 75 f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow ); 76 f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow ); 77 78 __m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa ); 79 80 return _mm_packs_epi32( flt16, flt16 ); 81 } 82 83 #endif 84 85 ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) { 86 const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT; 87 const int f32_exponent_mask = ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS; 88 const int f32_mantissa_mask = ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1; 89 const int f16_min_exponent = 0; 90 const int f16_max_exponent = ( 30 << IEEE_FLT16_MANTISSA_BITS ); 91 const int f16_min_mantissa = 0; 92 const int f16_max_mantissa = ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ); 93 const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT; 94 const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS; 95 const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS; 96 const int f32_to_f16_exponent_bias = ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS; 97 98 int f32_bits = *(unsigned int *)&f32; 99 100 int f16_sign = ( (unsigned int )( f32_bits & f32_sign_mask ) >> f32_to_f16_sign_shift ); 101 int f16_exponent = ( (unsigned int )( f32_bits & f32_exponent_mask ) >> f32_to_f16_exponent_shift ); 102 int f16_mantissa = ( (unsigned int )( f32_bits & f32_mantissa_mask ) >> f32_to_f16_mantissa_shift ); 103 104 f16_exponent -= f32_to_f16_exponent_bias; 105 106 const bool underflow = ( f16_exponent < f16_min_exponent ); 107 const bool overflow = ( f16_exponent > f16_max_exponent ); 108 109 f16_exponent = underflow ? f16_min_exponent : f16_exponent; 110 f16_exponent = overflow ? f16_max_exponent : f16_exponent; 111 f16_mantissa = underflow ? f16_min_mantissa : f16_mantissa; 112 f16_mantissa = overflow ? f16_max_mantissa : f16_mantissa; 113 114 return (halfFloat_t)( f16_sign | f16_exponent | f16_mantissa ); 115 } 116 117 /* 118 ==================== 119 LoadSkinnedDrawVertPosition 120 ==================== 121 */ 122 #ifdef ID_WIN_X86_SSE2_INTRIN 123 124 ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) { 125 const idJointMat & j0 = joints[base.color[0]]; 126 const idJointMat & j1 = joints[base.color[1]]; 127 const idJointMat & j2 = joints[base.color[2]]; 128 const idJointMat & j3 = joints[base.color[3]]; 129 130 __m128i weights_b = _mm_cvtsi32_si128( *(const unsigned int *)base.color2 ); 131 __m128i weights_s = _mm_unpacklo_epi8( weights_b, vector_int_zero ); 132 __m128i weights_i = _mm_unpacklo_epi16( weights_s, vector_int_zero ); 133 134 __m128 weights = _mm_cvtepi32_ps( weights_i ); 135 weights = _mm_mul_ps( weights, vector_float_1_over_255 ); 136 137 __m128 w0 = _mm_splat_ps( weights, 0 ); 138 __m128 w1 = _mm_splat_ps( weights, 1 ); 139 __m128 w2 = _mm_splat_ps( weights, 2 ); 140 __m128 w3 = _mm_splat_ps( weights, 3 ); 141 142 __m128 matX = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 0 * 4 ), w0 ); 143 __m128 matY = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 1 * 4 ), w0 ); 144 __m128 matZ = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 2 * 4 ), w0 ); 145 146 matX = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 0 * 4 ), w1, matX ); 147 matY = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 1 * 4 ), w1, matY ); 148 matZ = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 2 * 4 ), w1, matZ ); 149 150 matX = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 0 * 4 ), w2, matX ); 151 matY = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 1 * 4 ), w2, matY ); 152 matZ = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 2 * 4 ), w2, matZ ); 153 154 matX = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 0 * 4 ), w3, matX ); 155 matY = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 1 * 4 ), w3, matY ); 156 matZ = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 2 * 4 ), w3, matZ ); 157 158 __m128 v = _mm_load_ps( base.xyz.ToFloatPtr() ); 159 v = _mm_and_ps( v, vector_float_mask_clear_last ); 160 v = _mm_or_ps( v, vector_float_last_one ); 161 162 __m128 t0 = _mm_mul_ps( matX, v ); 163 __m128 t1 = _mm_mul_ps( matY, v ); 164 __m128 t2 = _mm_mul_ps( matZ, v ); 165 __m128 t3 = vector_float_1_over_4; 166 167 __m128 s0 = _mm_unpacklo_ps( t0, t2 ); // x0, z0, x1, z1 168 __m128 s1 = _mm_unpackhi_ps( t0, t2 ); // x2, z2, x3, z3 169 __m128 s2 = _mm_unpacklo_ps( t1, t3 ); // y0, w0, y1, w1 170 __m128 s3 = _mm_unpackhi_ps( t1, t3 ); // y2, w2, y3, w3 171 172 __m128 r0 = _mm_unpacklo_ps( s0, s2 ); // x0, y0, z0, w0 173 __m128 r1 = _mm_unpackhi_ps( s0, s2 ); // x1, y1, z1, w1 174 __m128 r2 = _mm_unpacklo_ps( s1, s3 ); // x2, y2, z2, w2 175 __m128 r3 = _mm_unpackhi_ps( s1, s3 ); // x3, y3, z3, w3 176 177 r0 = _mm_add_ps( r0, r1 ); 178 r2 = _mm_add_ps( r2, r3 ); 179 r0 = _mm_add_ps( r0, r2 ); 180 181 return r0; 182 } 183 184 #endif 185 186 ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) { 187 const idJointMat & j0 = joints[vert.color[0]]; 188 const idJointMat & j1 = joints[vert.color[1]]; 189 const idJointMat & j2 = joints[vert.color[2]]; 190 const idJointMat & j3 = joints[vert.color[3]]; 191 192 const float w0 = vert.color2[0] * ( 1.0f / 255.0f ); 193 const float w1 = vert.color2[1] * ( 1.0f / 255.0f ); 194 const float w2 = vert.color2[2] * ( 1.0f / 255.0f ); 195 const float w3 = vert.color2[3] * ( 1.0f / 255.0f ); 196 197 idJointMat accum; 198 idJointMat::Mul( accum, j0, w0 ); 199 idJointMat::Mad( accum, j1, w1 ); 200 idJointMat::Mad( accum, j2, w2 ); 201 idJointMat::Mad( accum, j3, w3 ); 202 203 return accum * idVec4( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f ); 204 } 205 206 #endif /* !__DRAWVERT_INTRINSICS_H__ */