sys_intrinsics.h - DOOM-3-BFG - DOOM 3 BFG Edition

sys_intrinsics.h (9972B)
      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 #ifndef __SYS_INTRIINSICS_H__
     29 #define __SYS_INTRIINSICS_H__
     30 
     31 /*
     32 ================================================================================================
     33 
     34 	Scalar single precision floating-point intrinsics
     35 
     36 ================================================================================================
     37 */
     38 
     39 ID_INLINE_EXTERN float __fmuls( float a, float b )				{	return ( a * b ); }
     40 ID_INLINE_EXTERN float __fmadds( float a, float b, float c )	{	return ( a * b + c ); }
     41 ID_INLINE_EXTERN float __fnmsubs( float a, float b, float c )	{	return ( c - a * b ); }
     42 ID_INLINE_EXTERN float __fsels( float a, float b, float c )		{	return ( a >= 0.0f ) ? b : c; }
     43 ID_INLINE_EXTERN float __frcps( float x )						{	return ( 1.0f / x ); }
     44 ID_INLINE_EXTERN float __fdivs( float x, float y )				{	return ( x / y ); }
     45 ID_INLINE_EXTERN float __frsqrts( float x )						{	return ( 1.0f / sqrtf( x ) ); }
     46 ID_INLINE_EXTERN float __frcps16( float x )						{	return ( 1.0f / x ); }
     47 ID_INLINE_EXTERN float __fdivs16( float x, float y )			{	return ( x / y ); }
     48 ID_INLINE_EXTERN float __frsqrts16( float x )					{	return ( 1.0f / sqrtf( x ) ); }
     49 ID_INLINE_EXTERN float __frndz( float x )						{	return (float)( (int)( x ) ); }
     50 
     51 /*
     52 ================================================================================================
     53 
     54 	Zero cache line and prefetch intrinsics
     55 
     56 ================================================================================================
     57 */
     58 
     59 #ifdef ID_WIN_X86_SSE2_INTRIN
     60 
     61 // The code below assumes that a cache line is 64 bytes.
     62 // We specify the cache line size as 128 here to make the code consistent with the consoles.
     63 #define CACHE_LINE_SIZE						128
     64 
     65 ID_FORCE_INLINE void Prefetch( const void * ptr, int offset ) {
     66 //	const char * bytePtr = ( (const char *) ptr ) + offset;
     67 //	_mm_prefetch( bytePtr +  0, _MM_HINT_NTA );
     68 //	_mm_prefetch( bytePtr + 64, _MM_HINT_NTA );
     69 }
     70 ID_FORCE_INLINE void ZeroCacheLine( void * ptr, int offset ) {
     71 	assert_128_byte_aligned( ptr );
     72 	char * bytePtr = ( (char *) ptr ) + offset;
     73 	__m128i zero = _mm_setzero_si128();
     74 	_mm_store_si128( (__m128i *) ( bytePtr + 0*16 ), zero );
     75 	_mm_store_si128( (__m128i *) ( bytePtr + 1*16 ), zero );
     76 	_mm_store_si128( (__m128i *) ( bytePtr + 2*16 ), zero );
     77 	_mm_store_si128( (__m128i *) ( bytePtr + 3*16 ), zero );
     78 	_mm_store_si128( (__m128i *) ( bytePtr + 4*16 ), zero );
     79 	_mm_store_si128( (__m128i *) ( bytePtr + 5*16 ), zero );
     80 	_mm_store_si128( (__m128i *) ( bytePtr + 6*16 ), zero );
     81 	_mm_store_si128( (__m128i *) ( bytePtr + 7*16 ), zero );
     82 }
     83 ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) {
     84 	const char * bytePtr = ( (const char *) ptr ) + offset;
     85 	_mm_clflush( bytePtr +  0 );
     86 	_mm_clflush( bytePtr + 64 );
     87 }
     88 
     89 /*
     90 ================================================
     91 	Other
     92 ================================================
     93 */
     94 #else
     95 
     96 #define CACHE_LINE_SIZE						128
     97 
     98 ID_INLINE void Prefetch( const void * ptr, int offset ) {}
     99 ID_INLINE void ZeroCacheLine( void * ptr, int offset ) {
    100 	byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) );
    101 	memset( bytePtr, 0, CACHE_LINE_SIZE );
    102 }
    103 ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {}
    104 
    105 #endif
    106 
    107 /*
    108 ================================================
    109 	Block Clear Macros
    110 ================================================
    111 */
    112 
    113 // number of additional elements that are potentially cleared when clearing whole cache lines at a time
    114 ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size ) {
    115 	if ( ( size & ( CACHE_LINE_SIZE - 1 ) ) == 0 ) {
    116 		return 0;
    117 	}
    118 	if ( size > CACHE_LINE_SIZE ) {
    119 		return 1;
    120 	}
    121 	return ( CACHE_LINE_SIZE / ( size & ( CACHE_LINE_SIZE - 1 ) ) );
    122 }
    123 
    124 // if the pointer is not on a cache line boundary this assumes the cache line the pointer starts in was already cleared
    125 #define CACHE_LINE_CLEAR_BLOCK( ptr, size )																		\
    126 	byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );	\
    127 	byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );				\
    128 	for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) {													\
    129 		ZeroCacheLine( startPtr, 0 );																			\
    130 	}
    131 
    132 #define CACHE_LINE_CLEAR_BLOCK_AND_FLUSH( ptr, size )															\
    133 	byte * startPtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );	\
    134 	byte * endPtr = (byte *)( ( (UINT_PTR) ( ptr ) + ( size ) - 1 ) & ~( CACHE_LINE_SIZE - 1 ) );				\
    135 	for ( ; startPtr <= endPtr; startPtr += CACHE_LINE_SIZE ) {													\
    136 		ZeroCacheLine( startPtr, 0 );																			\
    137 		FlushCacheLine( startPtr, 0 );																			\
    138 	}
    139 
    140 /*
    141 ================================================================================================
    142 
    143 	Vector Intrinsics
    144 
    145 ================================================================================================
    146 */
    147 
    148 /*
    149 ================================================
    150 	PC Windows
    151 ================================================
    152 */
    153 
    154 #if !defined( R_SHUFFLE_D )
    155 #define R_SHUFFLE_D( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
    156 #endif
    157 
    158 // make the intrinsics "type unsafe"
    159 typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128c {
    160 				__m128c() {}
    161 				__m128c( __m128 f ) { m128 = f; }
    162 				__m128c( __m128i i ) { m128i = i; }
    163 	operator	__m128() { return m128; }
    164 	operator	__m128i() { return m128i; }
    165 	__m128		m128;
    166 	__m128i		m128i;
    167 } __m128c;
    168 
    169 #define _mm_madd_ps( a, b, c )				_mm_add_ps( _mm_mul_ps( (a), (b) ), (c) )
    170 #define _mm_nmsub_ps( a, b, c )				_mm_sub_ps( (c), _mm_mul_ps( (a), (b) ) )
    171 #define _mm_splat_ps( x, i )				__m128c( _mm_shuffle_epi32( __m128c( x ), _MM_SHUFFLE( i, i, i, i ) ) )
    172 #define _mm_perm_ps( x, perm )				__m128c( _mm_shuffle_epi32( __m128c( x ), perm ) )
    173 #define _mm_sel_ps( a, b, c )  				_mm_or_ps( _mm_andnot_ps( __m128c( c ), a ), _mm_and_ps( __m128c( c ), b ) )
    174 #define _mm_sel_si128( a, b, c )			_mm_or_si128( _mm_andnot_si128( __m128c( c ), a ), _mm_and_si128( __m128c( c ), b ) )
    175 #define _mm_sld_ps( x, y, imm )				__m128c( _mm_or_si128( _mm_srli_si128( __m128c( x ), imm ), _mm_slli_si128( __m128c( y ), 16 - imm ) ) )
    176 #define _mm_sld_si128( x, y, imm )			_mm_or_si128( _mm_srli_si128( x, imm ), _mm_slli_si128( y, 16 - imm ) )
    177 
    178 ID_FORCE_INLINE_EXTERN __m128 _mm_msum3_ps( __m128 a, __m128 b )	{
    179 	__m128 c = _mm_mul_ps( a, b );
    180 	return _mm_add_ps( _mm_splat_ps( c, 0 ), _mm_add_ps( _mm_splat_ps( c, 1 ), _mm_splat_ps( c, 2 ) ) );
    181 }
    182 
    183 ID_FORCE_INLINE_EXTERN __m128 _mm_msum4_ps( __m128 a, __m128 b ) {
    184 	__m128 c = _mm_mul_ps( a, b );
    185 	c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    186 	c = _mm_add_ps( c, _mm_perm_ps( c, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    187 	return c;
    188 }
    189 
    190 #define _mm_shufmix_epi32( x, y, perm )		__m128c( _mm_shuffle_ps( __m128c( x ), __m128c( y ), perm ) )
    191 #define _mm_loadh_epi64( x, address )		__m128c( _mm_loadh_pi( __m128c( x ), (__m64 *)address ) )
    192 #define _mm_storeh_epi64( address, x )		_mm_storeh_pi( (__m64 *)address, __m128c( x ) )
    193 
    194 // floating-point reciprocal with close to full precision
    195 ID_FORCE_INLINE_EXTERN __m128 _mm_rcp32_ps( __m128 x ) {
    196 	__m128 r = _mm_rcp_ps( x );		// _mm_rcp_ps() has 12 bits of precision
    197 	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
    198 	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
    199 	return r;
    200 }
    201 // floating-point reciprocal with at least 16 bits precision
    202 ID_FORCE_INLINE_EXTERN __m128 _mm_rcp16_ps( __m128 x ) {
    203 	__m128 r = _mm_rcp_ps( x );		// _mm_rcp_ps() has 12 bits of precision
    204 	r = _mm_sub_ps( _mm_add_ps( r, r ), _mm_mul_ps( _mm_mul_ps( x, r ), r ) );
    205 	return r;
    206 }
    207 // floating-point divide with close to full precision
    208 ID_FORCE_INLINE_EXTERN __m128 _mm_div32_ps( __m128 x, __m128 y ) {
    209 	return _mm_mul_ps( x, _mm_rcp32_ps( y ) );
    210 }
    211 // floating-point divide with at least 16 bits precision
    212 ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y ) {
    213 	return _mm_mul_ps( x, _mm_rcp16_ps( y ) );
    214 }
    215 // load idBounds::GetMins()
    216 #define _mm_loadu_bounds_0( bounds )		_mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[0].x ), (__m64 *) & bounds[0].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
    217 // load idBounds::GetMaxs()
    218 #define _mm_loadu_bounds_1( bounds )		_mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
    219 
    220 #endif	// !__SYS_INTRIINSICS_H__