sys_intrinsics.h (9972B)
1 /* 2 =========================================================================== 3 4 Doom 3 BFG Edition GPL Source Code 5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 6 7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 8 9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>. 21 22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. 23 24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. 
25 26 =========================================================================== 27 */ 28 #ifndef __SYS_INTRIINSICS_H__ 29 #define __SYS_INTRIINSICS_H__ 30 31 /* 32 ================================================================================================ 33 34 Scalar single precision floating-point intrinsics 35 36 ================================================================================================ 37 */ 38 39 ID_INLINE_EXTERN float __fmuls( float a, float b ) { return ( a * b ); } 40 ID_INLINE_EXTERN float __fmadds( float a, float b, float c ) { return ( a * b + c ); } 41 ID_INLINE_EXTERN float __fnmsubs( float a, float b, float c ) { return ( c - a * b ); } 42 ID_INLINE_EXTERN float __fsels( float a, float b, float c ) { return ( a >= 0.0f ) ? b : c; } 43 ID_INLINE_EXTERN float __frcps( float x ) { return ( 1.0f / x ); } 44 ID_INLINE_EXTERN float __fdivs( float x, float y ) { return ( x / y ); } 45 ID_INLINE_EXTERN float __frsqrts( float x ) { return ( 1.0f / sqrtf( x ) ); } 46 ID_INLINE_EXTERN float __frcps16( float x ) { return ( 1.0f / x ); } 47 ID_INLINE_EXTERN float __fdivs16( float x, float y ) { return ( x / y ); } 48 ID_INLINE_EXTERN float __frsqrts16( float x ) { return ( 1.0f / sqrtf( x ) ); } 49 ID_INLINE_EXTERN float __frndz( float x ) { return (float)( (int)( x ) ); } 50 51 /* 52 ================================================================================================ 53 54 Zero cache line and prefetch intrinsics 55 56 ================================================================================================ 57 */ 58 59 #ifdef ID_WIN_X86_SSE2_INTRIN 60 61 // The code below assumes that a cache line is 64 bytes. 62 // We specify the cache line size as 128 here to make the code consistent with the consoles. 
#define CACHE_LINE_SIZE						128

// Prefetch the two 64-byte hardware cache lines that cover the 128-byte
// "logical" cache line at ptr + offset.
// NOTE(review): the actual prefetch is disabled (body commented out), so this
// is currently a no-op on the SSE2 path as well.
ID_FORCE_INLINE void Prefetch( const void * ptr, int offset ) {
//	const char * bytePtr = ( (const char *) ptr ) + offset;
//	_mm_prefetch( bytePtr + 0, _MM_HINT_NTA );
//	_mm_prefetch( bytePtr + 64, _MM_HINT_NTA );
}

// Zero the 128-byte logical cache line at ptr + offset with eight 16-byte
// aligned SSE2 stores. ptr must be 128-byte aligned (asserted); offset is
// presumably a multiple of 128 so the stores stay 16-byte aligned -- TODO confirm callers.
ID_FORCE_INLINE void ZeroCacheLine( void * ptr, int offset ) {
	assert_128_byte_aligned( ptr );
	char * bytePtr = ( (char *) ptr ) + offset;
	__m128i zero = _mm_setzero_si128();
	_mm_store_si128( (__m128i *) ( bytePtr + 0*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 1*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 2*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 3*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 4*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 5*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 6*16 ), zero );
	_mm_store_si128( (__m128i *) ( bytePtr + 7*16 ), zero );
}

// Flush (write back and invalidate) the two 64-byte hardware cache lines
// covering the 128-byte logical line at ptr + offset.
ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) {
	const char * bytePtr = ( (const char *) ptr ) + offset;
	_mm_clflush( bytePtr + 0 );
	_mm_clflush( bytePtr + 64 );
}

/*
================================================
	Other
================================================
*/
#else

#define CACHE_LINE_SIZE						128

// Generic fallback: prefetching is a no-op.
ID_INLINE void Prefetch( const void * ptr, int offset ) {}
// Generic fallback: zero the whole 128-byte aligned block containing
// ptr + offset (the address is rounded DOWN to a 128-byte boundary).
ID_INLINE void ZeroCacheLine( void * ptr, int offset ) {
	byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) );
	memset( bytePtr, 0, CACHE_LINE_SIZE );
}
// Generic fallback: cache flushing is a no-op.
ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {}

#endif

/*
================================================
	Block Clear Macros
================================================
*/

// number of additional elements that are potentially cleared when clearing whole cache lines at a time
ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size ) {
	if ( ( size & ( CACHE_LINE_SIZE - 1 ) ) == 0 ) {
		// size is an exact multiple of the cache line size: no overflow
		return 0;
	}
	if ( size > CACHE_LINE_SIZE ) {
		// larger than one line: spills into at most one extra element
		return 1;
	}
	// smaller than one line: clearing a whole line may touch this many extra elements
	return ( CACHE_LINE_SIZE / ( size & ( CACHE_LINE_SIZE - 1 ) ) );
}

// if the pointer is not on a cache line boundary this assumes the cache line the pointer starts in was already cleared
// Zeroes every whole cache line inside [ptr, ptr + size).
// NOTE: declares locals startPtr/endPtr in the enclosing scope, so it can be used at most once per block.
#define CACHE_LINE_CLEAR_BLOCK( ptr, size )																		\
	byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );	\
	byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );				\
	for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) {													\
		ZeroCacheLine( startPtr, 0 );																			\
	}

// Same as CACHE_LINE_CLEAR_BLOCK, but also flushes each cleared line out of the cache.
#define CACHE_LINE_CLEAR_BLOCK_AND_FLUSH( ptr, size )															\
	byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );	\
	byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );				\
	for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) {													\
		ZeroCacheLine( startPtr, 0 );																			\
		FlushCacheLine( startPtr, 0 );																			\
	}

/*
================================================================================================

	Vector Intrinsics

================================================================================================
*/

/*
================================================
	PC Windows
================================================
*/

#if !defined( R_SHUFFLE_D )
// pack four 2-bit lane selectors into a shuffle immediate (x selects lane 0)
#define R_SHUFFLE_D( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#endif

// make the intrinsics "type unsafe"
// Union that converts freely between __m128 (float) and __m128i (integer),
// so float and integer SSE intrinsics can be mixed without explicit casts.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128c {
	__m128c() {}
	__m128c( __m128 f ) { m128 = f; }
	__m128c( __m128i i ) { m128i = i; }
	operator __m128() { return m128; }
	operator __m128i() { return m128i; }
	__m128	m128;
	__m128i	m128i;
} __m128c;

// multiply-add / negated multiply-subtract composed from mul + add/sub
#define _mm_madd_ps( a, b, c )				_mm_add_ps( _mm_mul_ps( (a), (b) ), (c) )
#define _mm_nmsub_ps( a, b, c )				_mm_sub_ps( (c), _mm_mul_ps( (a), (b) ) )
// replicate lane i of x into all four lanes
#define _mm_splat_ps( x, i )				__m128c( _mm_shuffle_epi32( __m128c( x ), _MM_SHUFFLE( i, i, i, i ) ) )
// permute the lanes of x by the shuffle immediate 'perm'
#define _mm_perm_ps( x, perm )				__m128c( _mm_shuffle_epi32( __m128c( x ), perm ) )
// per-bit select: ( c & b ) | ( ~c & a )
#define _mm_sel_ps( a, b, c )  				_mm_or_ps( _mm_andnot_ps( __m128c( c ), a ), _mm_and_ps( __m128c( c ), b ) )
#define _mm_sel_si128( a, b, c )			_mm_or_si128( _mm_andnot_si128( __m128c( c ), a ), _mm_and_si128( __m128c( c ), b ) )
// combine x shifted right by imm bytes with y shifted left by (16 - imm) bytes
#define _mm_sld_ps( x, y, imm )				__m128c( _mm_or_si128( _mm_srli_si128( __m128c( x ), imm ), _mm_slli_si128( __m128c( y ), 16 - imm ) ) )
#define _mm_sld_si128( x, y, imm )			_mm_or_si128( _mm_srli_si128( x, imm ), _mm_slli_si128( y, 16 - imm ) )

// 3-component dot product of a and b, result replicated into all four lanes
// (lane 3 of the inputs does not contribute)
ID_FORCE_INLINE_EXTERN __m128 _mm_msum3_ps( __m128 a, __m128 b ) {
	__m128 c = _mm_mul_ps( a, b );
	return _mm_add_ps( _mm_splat_ps( c, 0 ), _mm_add_ps( _mm_splat_ps( c, 1 ), _mm_splat_ps( c, 2 ) ) );
}

// 4-component dot product of a and b, result replicated into all four lanes
ID_FORCE_INLINE_EXTERN __m128 _mm_msum4_ps( __m128 a, __m128 b ) {
	__m128 c = _mm_mul_ps( a, b );
	c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );	// add opposite halves: c[i] += c[i^2]
	c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );	// add neighbours: c[i] += c[i^1]
	return c;
}

// shuffle mixing lanes of two registers (float shuffle on "type unsafe" operands)
#define _mm_shufmix_epi32( x, y, perm )		__m128c( _mm_shuffle_ps( __m128c( x ), __m128c( y ), perm ) )
// load / store only the upper 64 bits of a register
#define _mm_loadh_epi64( x, address )		__m128c( _mm_loadh_pi( __m128c( x ), (__m64 *)address ) )
#define _mm_storeh_epi64( address, x )		_mm_storeh_pi( (__m64 *)address, __m128c( x ) )

// floating-point reciprocal with close to full precision
ID_FORCE_INLINE_EXTERN __m128 _mm_rcp32_ps( __m128 x ) {
	__m128 r = _mm_rcp_ps( x );		// _mm_rcp_ps() has 12 bits of precision
	// two Newton-Raphson refinement steps: r' = 2*r - x*r*r
	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
	return r;
}

// floating-point reciprocal with at least 16 bits precision
ID_FORCE_INLINE_EXTERN __m128 _mm_rcp16_ps( __m128 x ) {
	__m128 r = _mm_rcp_ps( x );		// _mm_rcp_ps() has 12 bits of precision
	// one Newton-Raphson refinement step: r' = 2*r - x*r*r
	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
	return r;
}

// floating-point divide with close to full precision
ID_FORCE_INLINE_EXTERN __m128 _mm_div32_ps( __m128 x, __m128 y ) {
	return _mm_mul_ps( x, _mm_rcp32_ps( y ) );
}

// floating-point divide with at least 16 bits precision
ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y ) {
	return _mm_mul_ps( x, _mm_rcp16_ps( y ) );
}

// load idBounds::GetMins() (unaligned 3-float load: x via load_ss, y/z via loadh_pi, then permute into place)
#define _mm_loadu_bounds_0( bounds )		_mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[0].x ), (__m64 *) & bounds[0].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
// load idBounds::GetMaxs()
#define _mm_loadu_bounds_1( bounds )		_mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )

#endif // !__SYS_INTRIINSICS_H__