DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

DrawVert_intrinsics.h (9874B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 
     29 #ifndef __DRAWVERT_INTRINSICS_H__
     30 #define __DRAWVERT_INTRINSICS_H__
     31 
     32 #ifdef ID_WIN_X86_SSE2_INTRIN
     33 
     34 static const __m128i vector_int_f32_sign_mask					= _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
     35 static const __m128i vector_int_f32_exponent_mask				= _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
     36 static const __m128i vector_int_f32_mantissa_mask				= _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
     37 static const __m128i vector_int_f16_min_exponent				= _mm_set1_epi32( 0 );
     38 static const __m128i vector_int_f16_max_exponent				= _mm_set1_epi32( ( 30 << IEEE_FLT16_MANTISSA_BITS ) );
     39 static const __m128i vector_int_f16_min_mantissa				= _mm_set1_epi32( 0 );
     40 static const __m128i vector_int_f16_max_mantissa				= _mm_set1_epi32( ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ) );
     41 static const __m128i vector_int_f32_to_f16_exponent_bias		= _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS );
     42 static const int f32_to_f16_sign_shift							= IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
     43 static const int f32_to_f16_exponent_shift						= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
     44 static const int f32_to_f16_mantissa_shift						= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
     45 
     46 static const __m128i vector_int_zero							= _mm_setzero_si128();
     47 static const __m128i vector_int_one								= _mm_set_epi32( 1, 1, 1, 1 );
     48 
     49 static const __m128 vector_float_mask_clear_last				= __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) );
     50 static const __m128 vector_float_last_one						= {   0.0f,	  0.0f,   0.0f,   1.0f };
     51 static const __m128 vector_float_1_over_255						= { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
     52 static const __m128 vector_float_1_over_4						= { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
     53 
     54 #endif
     55 
     56 /*
     57 ====================
     58 FastF32toF16
     59 ====================
     60 */
     61 #ifdef ID_WIN_X86_SSE2_INTRIN
     62 
     63 ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
     64 	__m128i f16_sign     = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask     ), f32_to_f16_sign_shift );
     65 	__m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift );
     66 	__m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift );
     67 
     68 	f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias );
     69 
     70 	const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent );
     71 	const __m128i overflow  = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent );
     72 
     73 	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow );
     74 	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow );
     75 	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow );
     76 	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow );
     77 
     78 	__m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa );
     79 
     80 	return _mm_packs_epi32( flt16, flt16 );
     81 }
     82 
     83 #endif
     84 
     85 ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
     86 	const int f32_sign_mask				= 1U << IEEE_FLT_SIGN_BIT;
     87 	const int f32_exponent_mask			= ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS;
     88 	const int f32_mantissa_mask			= ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1;
     89 	const int f16_min_exponent			= 0;
     90 	const int f16_max_exponent			= ( 30 << IEEE_FLT16_MANTISSA_BITS );
     91 	const int f16_min_mantissa			= 0;
     92 	const int f16_max_mantissa			= ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );
     93 	const int f32_to_f16_sign_shift		= IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
     94 	const int f32_to_f16_exponent_shift	= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
     95 	const int f32_to_f16_mantissa_shift	= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
     96 	const int f32_to_f16_exponent_bias	= ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS;
     97 
     98 	int f32_bits = *(unsigned int *)&f32;
     99 
    100 	int f16_sign     = ( (unsigned int )( f32_bits & f32_sign_mask     ) >> f32_to_f16_sign_shift );
    101 	int f16_exponent = ( (unsigned int )( f32_bits & f32_exponent_mask ) >> f32_to_f16_exponent_shift );
    102 	int f16_mantissa = ( (unsigned int )( f32_bits & f32_mantissa_mask ) >> f32_to_f16_mantissa_shift );
    103 
    104 	f16_exponent -= f32_to_f16_exponent_bias;
    105 
    106 	const bool underflow = ( f16_exponent < f16_min_exponent );
    107 	const bool overflow  = ( f16_exponent > f16_max_exponent );
    108 
    109 	f16_exponent = underflow ? f16_min_exponent : f16_exponent;
    110 	f16_exponent = overflow  ? f16_max_exponent : f16_exponent;
    111 	f16_mantissa = underflow ? f16_min_mantissa : f16_mantissa;
    112 	f16_mantissa = overflow  ? f16_max_mantissa : f16_mantissa;
    113 
    114 	return (halfFloat_t)( f16_sign | f16_exponent | f16_mantissa );
    115 }
    116 
    117 /*
    118 ====================
    119 LoadSkinnedDrawVertPosition
    120 ====================
    121 */
    122 #ifdef ID_WIN_X86_SSE2_INTRIN
    123 
    124 ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
    125 	const idJointMat & j0 = joints[base.color[0]];
    126 	const idJointMat & j1 = joints[base.color[1]];
    127 	const idJointMat & j2 = joints[base.color[2]];
    128 	const idJointMat & j3 = joints[base.color[3]];
    129 
    130 	__m128i weights_b = _mm_cvtsi32_si128( *(const unsigned int *)base.color2 );
    131 	__m128i weights_s = _mm_unpacklo_epi8( weights_b, vector_int_zero );
    132 	__m128i weights_i = _mm_unpacklo_epi16( weights_s, vector_int_zero );
    133 
    134 	__m128 weights = _mm_cvtepi32_ps( weights_i );
    135 	weights = _mm_mul_ps( weights, vector_float_1_over_255 );
    136 
    137 	__m128 w0 = _mm_splat_ps( weights, 0 );
    138 	__m128 w1 = _mm_splat_ps( weights, 1 );
    139 	__m128 w2 = _mm_splat_ps( weights, 2 );
    140 	__m128 w3 = _mm_splat_ps( weights, 3 );
    141 
    142 	__m128 matX = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 0 * 4 ), w0 );
    143 	__m128 matY = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 1 * 4 ), w0 );
    144 	__m128 matZ = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 2 * 4 ), w0 );
    145 
    146 	matX = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 0 * 4 ), w1, matX );
    147 	matY = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 1 * 4 ), w1, matY );
    148 	matZ = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 2 * 4 ), w1, matZ );
    149 
    150 	matX = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 0 * 4 ), w2, matX );
    151 	matY = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 1 * 4 ), w2, matY );
    152 	matZ = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 2 * 4 ), w2, matZ );
    153 
    154 	matX = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 0 * 4 ), w3, matX );
    155 	matY = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 1 * 4 ), w3, matY );
    156 	matZ = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 2 * 4 ), w3, matZ );
    157 
    158 	__m128 v = _mm_load_ps( base.xyz.ToFloatPtr() );
    159 	v = _mm_and_ps( v, vector_float_mask_clear_last );
    160 	v = _mm_or_ps( v, vector_float_last_one );
    161 
    162 	__m128 t0 = _mm_mul_ps( matX, v );
    163 	__m128 t1 = _mm_mul_ps( matY, v );
    164 	__m128 t2 = _mm_mul_ps( matZ, v );
    165 	__m128 t3 = vector_float_1_over_4;
    166 
    167 	__m128 s0 = _mm_unpacklo_ps( t0, t2 );	// x0, z0, x1, z1
    168 	__m128 s1 = _mm_unpackhi_ps( t0, t2 );	// x2, z2, x3, z3
    169 	__m128 s2 = _mm_unpacklo_ps( t1, t3 );	// y0, w0, y1, w1
    170 	__m128 s3 = _mm_unpackhi_ps( t1, t3 );	// y2, w2, y3, w3
    171 
    172 	__m128 r0 = _mm_unpacklo_ps( s0, s2 );	// x0, y0, z0, w0
    173 	__m128 r1 = _mm_unpackhi_ps( s0, s2 );	// x1, y1, z1, w1
    174 	__m128 r2 = _mm_unpacklo_ps( s1, s3 );	// x2, y2, z2, w2
    175 	__m128 r3 = _mm_unpackhi_ps( s1, s3 );	// x3, y3, z3, w3
    176 
    177 	r0 = _mm_add_ps( r0, r1 );
    178 	r2 = _mm_add_ps( r2, r3 );
    179 	r0 = _mm_add_ps( r0, r2 );
    180 
    181 	return r0;
    182 }
    183 
    184 #endif
    185 
    186 ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
    187 	const idJointMat & j0 = joints[vert.color[0]];
    188 	const idJointMat & j1 = joints[vert.color[1]];
    189 	const idJointMat & j2 = joints[vert.color[2]];
    190 	const idJointMat & j3 = joints[vert.color[3]];
    191 
    192 	const float w0 = vert.color2[0] * ( 1.0f / 255.0f );
    193 	const float w1 = vert.color2[1] * ( 1.0f / 255.0f );
    194 	const float w2 = vert.color2[2] * ( 1.0f / 255.0f );
    195 	const float w3 = vert.color2[3] * ( 1.0f / 255.0f );
    196 
    197 	idJointMat accum;
    198 	idJointMat::Mul( accum, j0, w0 );
    199 	idJointMat::Mad( accum, j1, w1 );
    200 	idJointMat::Mad( accum, j2, w2 );
    201 	idJointMat::Mad( accum, j3, w3 );
    202 
    203 	return accum * idVec4( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f );
    204 }
    205 
    206 #endif /* !__DRAWVERT_INTRINSICS_H__ */