DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

RenderMatrix.cpp (191624B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 
     29 #include "../ParallelJobList_JobHeaders.h"
     30 #include "../math/Math.h"
     31 #include "../math/Vector.h"
     32 #include "../math/Matrix.h"
     33 #include "../math/Rotation.h"
     34 #include "../math/Plane.h"
     35 #include "../bv/Sphere.h"
     36 #include "../bv/Bounds.h"
     37 #include "RenderMatrix.h"
     38 
     39 // FIXME:	it would be nice if all render matrices were 16-byte aligned
     40 //			so there is no need for unaligned loads and stores everywhere
     41 
     42 #ifdef _lint
     // When _lint is defined, drop the SSE2 intrinsics macro so the sections
     // guarded by "#ifdef ID_WIN_X86_SSE2_INTRIN" are compiled out.
     43 #undef ID_WIN_X86_SSE2_INTRIN
     44 #endif
     45 
     46 //lint -e438	// the non-SSE code isn't lint friendly, either
     47 //lint -e550
     48 
     // NOTE(review): the consumers of the two epsilons are outside this chunk;
     // presumably the inverse epsilon is a determinant threshold -- confirm.
     49 #define RENDER_MATRIX_INVERSE_EPSILON		1e-16f	// JDC: changed from 1e-14f to allow full wasteland parallel light projections to invert
     // Large finite stand-in for infinity; it seeds vector_float_pos_infinity and
     // vector_float_neg_infinity in the SIMD constants.
     50 #define RENDER_MATRIX_INFINITY				1e30f	// NOTE: cannot initialize a vec_float4 with idMath::INFINITY on the SPU
     51 #define RENDER_MATRIX_PROJECTION_EPSILON	0.1f
     52 
     53 #define CLIP_SPACE_OGL		// the OpenGL clip space Z is in the range [-1, 1]
     54 
     55 /*
     56 ================================================================================================
     57 
     58 Constant render matrices
     59 
     60 ================================================================================================
     61 */
     62 
     63 // identity matrix
     64 ALIGNTYPE16 const idRenderMatrix renderMatrix_identity(
     65 	1.0f, 0.0f, 0.0f, 0.0f,
     66 	0.0f, 1.0f, 0.0f, 0.0f,
     67 	0.0f, 0.0f, 1.0f, 0.0f,
     68 	0.0f, 0.0f, 0.0f, 1.0f
     69 );
     70 
     71 // convert from our coordinate system (looking down X) to OpenGL's coordinate system (looking down -Z)
     // Read row by row: out.x = -in.y, out.y = in.z, out.z = -in.x.
     // NOTE(review): this assumes the constructor takes the 16 values row by row
     // and that points are column vectors (CreateFromOriginAxis stores the
     // translation in the 4th column) -- confirm against RenderMatrix.h.
     72 ALIGNTYPE16 const idRenderMatrix renderMatrix_flipToOpenGL(
     73 	 0.0f, -1.0f, 0.0f, 0.0f,
     74 	 0.0f,  0.0f, 1.0f, 0.0f,
     75 	-1.0f,  0.0f, 0.0f, 0.0f,
     76 	 0.0f,  0.0f, 0.0f, 1.0f
     77 );
     78 
     79 // OpenGL -1 to 1.
     // Scales x, y and z by 2 and offsets them by -1, i.e. remaps the
     // [0, 1] window-space range onto the [-1, 1] clip-space range.
     80 ALIGNTYPE16 const idRenderMatrix renderMatrix_windowSpaceToClipSpace(
     81 	2.0f, 0.0f, 0.0f, -1.0f,
     82 	0.0f, 2.0f, 0.0f, -1.0f,
     83 	0.0f, 0.0f, 2.0f, -1.0f,
     84 	0.0f, 0.0f, 0.0f,  1.0f
     85 );
     86 
     87 /*
     88 ================================================================================================
     89 
     90 SIMD constants
     91 
     92 ================================================================================================
     93 */
     94 
     95 #ifdef ID_WIN_X86_SSE2_INTRIN
     96 
     // File-local constants for the SSE2 code paths; every constant is either a
     // broadcast of one value to all four lanes or an explicit per-lane pattern.

     // Integer lane constants. _mm_set_epi32 lists lanes from highest to lowest,
     // so vector_int_0123 holds 0, 1, 2, 3 in lanes 0..3.
     97 static const __m128i vector_int_1							= _mm_set1_epi32( 1 );
     98 static const __m128i vector_int_4							= _mm_set1_epi32( 4 );
     99 static const __m128i vector_int_0123						= _mm_set_epi32( 3, 2, 1, 0 );
    // Single-bit integer patterns (1 << k) stored in float vectors.
    // NOTE(review): __m128c is defined elsewhere; presumably it reinterprets the
    // integer vector's bits as __m128 without converting the values -- confirm.
    100 static const __m128 vector_float_mask0						= __m128c( _mm_set1_epi32( 1<<0 ) );
    101 static const __m128 vector_float_mask1						= __m128c( _mm_set1_epi32( 1<<1 ) );
    102 static const __m128 vector_float_mask2						= __m128c( _mm_set1_epi32( 1<<2 ) );
    103 static const __m128 vector_float_mask3						= __m128c( _mm_set1_epi32( 1<<3 ) );
    104 static const __m128 vector_float_mask4						= __m128c( _mm_set1_epi32( 1<<4 ) );
    105 static const __m128 vector_float_mask5						= __m128c( _mm_set1_epi32( 1<<5 ) );
    // Bit masks built from IEEE_FLT_SIGN_MASK: the sign bit alone, and everything
    // except the sign bit.
    106 static const __m128 vector_float_sign_bit					= __m128c( _mm_set1_epi32( IEEE_FLT_SIGN_MASK ) );
    107 static const __m128 vector_float_abs_mask					= __m128c( _mm_set1_epi32( ~IEEE_FLT_SIGN_MASK ) );
    // All bits set in the highest lane (lane 3) only, zero in lanes 0..2.
    108 static const __m128 vector_float_keep_last					= __m128c( _mm_set_epi32( -1, 0, 0, 0 ) );
    // Broadcasts of the RENDER_MATRIX_* macros and other scalar constants.
    109 static const __m128 vector_float_inverse_epsilon			= { RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON };
    110 static const __m128 vector_float_smallest_non_denorm		= { 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f };
    // "Infinity" here is the large finite RENDER_MATRIX_INFINITY, not an IEEE infinity.
    111 static const __m128 vector_float_pos_infinity				= { RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY };
    112 static const __m128 vector_float_neg_infinity				= { -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY };
    113 static const __m128 vector_float_zero						= { 0.0f, 0.0f, 0.0f, 0.0f };
    114 static const __m128 vector_float_half						= { 0.5f, 0.5f, 0.5f, 0.5f };
    115 static const __m128 vector_float_neg_half					= { -0.5f, -0.5f, -0.5f, -0.5f };
    // vector_float_one and vector_float_pos_one hold identical values.
    116 static const __m128 vector_float_one						= { 1.0f, 1.0f, 1.0f, 1.0f };
    117 static const __m128 vector_float_pos_one					= { +1.0f, +1.0f, +1.0f, +1.0f };
    118 static const __m128 vector_float_neg_one					= { -1.0f, -1.0f, -1.0f, -1.0f };
    // ( 0, 0, 0, 1 ): one in the last lane only.
    119 static const __m128 vector_float_last_one					= { 0.0f, 0.0f, 0.0f, 1.0f };
    120 
    121 #endif
    122 
    123 /*
    124 ================================================================================================
    125 
    126 Box definition
    127 
    128 ================================================================================================
    129 */
    130 
    131 /*
    132             4----{E}---5
    133  +         /|         /|
    134  Z      {H} {I}    {F} |
    135  -     /    |     /   {J}
    136       7--{G}-----6     |
    137       |     |    |     |
    138      {L}    0----|-{A}-1
    139       |    /    {K}   /       -
    140       | {D}      | {B}       Y
    141       |/         |/         +
    142       3---{C}----2
    143 
    144 	    - X +
    145 */
    146 
    // Corner indices of each of the six box faces, using the vertex numbering of
    // the box diagram above. The row order (neg-X, neg-Y, neg-Z, pos-X, pos-Y,
    // pos-Z) is the same as the bit order documented for GetBoxFrontBits.
    147 static const short boxPolygonVertices[6][4] = {
    148 	{ 0, 3, 7, 4 },	// neg-X
    149 	{ 0, 1, 5, 4 },	// neg-Y
    150 	{ 0, 1, 2, 3 },	// neg-Z
    151 	{ 1, 2, 6, 5 },	// pos-X
    152 	{ 2, 3, 7, 6 },	// pos-Y
    153 	{ 4, 5, 6, 7 }	// pos-Z
    154 };
    155 
    // The two end vertices of each of the twelve box edges; the letters A..L are
    // the edge labels of the box diagram above.
    156 static const short boxEdgeVertices[12][2] = {
    157 	/* A = */ { 0, 1 }, /* B = */ { 1, 2 }, /* C = */ { 2, 3 }, /* D = */ { 3, 0 },	// bottom
    158 	/* E = */ { 4, 5 }, /* F = */ { 5, 6 }, /* G = */ { 6, 7 }, /* H = */ { 7, 4 },	// top
    159 	/* I = */ { 0, 4 }, /* J = */ { 1, 5 }, /* K = */ { 2, 6 }, /* L = */ { 3, 7 }	// sides
    160 };
    161 
    // The two faces (row indices into boxPolygonVertices) that share each edge.
    // NOTE(review): unlike the two tables above this one is a non-const int
    // array, and in this part of the file it is only referenced by the
    // commented-out table generators -- check for other users before adding const.
    162 static int boxEdgePolygons[12][2] = {
    163 	/* A = */ { 1, 2 }, /* B = */ { 3, 2 }, /* C = */ { 4, 2 }, /* D = */ { 0, 2 },	// bottom
    164 	/* E = */ { 1, 5 }, /* F = */ { 3, 5 }, /* G = */ { 4, 5 }, /* H = */ { 0, 5 }, // top
    165 	/* I = */ { 0, 1 }, /* J = */ { 3, 1 }, /* K = */ { 3, 4 }, /* L = */ { 0, 4 }	// sides
    166 };
    167 
    168 /*
    169 #include <Windows.h>
    170 
    171 class idCreateBoxFrontPolygonsForFrontBits {
    172 public:
    173 	idCreateBoxFrontPolygonsForFrontBits() {
    174 		for ( int i = 0; i < 64; i++ ) {
    175 			int frontPolygons[7] = { 0 };
    176 			int numFrontPolygons = 0;
    177 			char bits[7] = { 0 };
    178 			for ( int j = 0; j < 6; j++ ) {
    179 				if ( ( i & ( 1 << j ) ) != 0 ) {
    180 					frontPolygons[numFrontPolygons++] = j;
    181 					bits[5 - j] = '1';
    182 				} else {
    183 					bits[5 - j] = '0';
    184 				}
    185 			}
    186 			const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
    187 			if ( i == 0 ) {
    188 				comment = " inside the box, every polygon is considered front facing";
    189 				numFrontPolygons = 6;
    190 				for ( int j = 0; j < 6; j++ ) {
    191 					frontPolygons[j] = j;
    192 				}
    193 			}
    194 			char buffer[1024];
    195 			sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n",
    196 								frontPolygons[0], frontPolygons[1], frontPolygons[2], frontPolygons[3], 
    197 								frontPolygons[4], frontPolygons[5], frontPolygons[6],
    198 								numFrontPolygons, bits, i, comment );
    199 			OutputDebugString( buffer );
    200 		}
    201 	}
    202 } createBoxFrontPolygonsForFrontBits;
    203 */
    204 
    205 // make sure this is a power of two for fast addressing an array of these without integer multiplication
    // Lookup table indexed by the 6-bit front-bits mask documented for
    // GetBoxFrontBits. indices[0..count-1] lists the front-facing faces as row
    // indices into boxPolygonVertices; the remaining slots are zero padding.
    // The struct is 7 index bytes plus 1 count byte (8 bytes, assuming byte is a
    // 1-byte type). Entries tagged "invalid" have both the negative and the
    // positive face of at least one axis marked front facing.
    206 static const struct frontPolygons_t {
    207 	byte	indices[7];
    208 	byte	count;
    209 } boxFrontPolygonsForFrontBits[64] = {
    210 	{ { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 000000 = 0 inside the box, every polygon is considered front facing
    211 	{ { 0, 0, 0, 0, 0, 0, 0 }, 1 }, // 000001 = 1
    212 	{ { 1, 0, 0, 0, 0, 0, 0 }, 1 }, // 000010 = 2
    213 	{ { 0, 1, 0, 0, 0, 0, 0 }, 2 }, // 000011 = 3
    214 	{ { 2, 0, 0, 0, 0, 0, 0 }, 1 }, // 000100 = 4
    215 	{ { 0, 2, 0, 0, 0, 0, 0 }, 2 }, // 000101 = 5
    216 	{ { 1, 2, 0, 0, 0, 0, 0 }, 2 }, // 000110 = 6
    217 	{ { 0, 1, 2, 0, 0, 0, 0 }, 3 }, // 000111 = 7
    218 	{ { 3, 0, 0, 0, 0, 0, 0 }, 1 }, // 001000 = 8
    219 	{ { 0, 3, 0, 0, 0, 0, 0 }, 2 }, // 001001 = 9 invalid
    220 	{ { 1, 3, 0, 0, 0, 0, 0 }, 2 }, // 001010 = 10
    221 	{ { 0, 1, 3, 0, 0, 0, 0 }, 3 }, // 001011 = 11 invalid
    222 	{ { 2, 3, 0, 0, 0, 0, 0 }, 2 }, // 001100 = 12
    223 	{ { 0, 2, 3, 0, 0, 0, 0 }, 3 }, // 001101 = 13 invalid
    224 	{ { 1, 2, 3, 0, 0, 0, 0 }, 3 }, // 001110 = 14
    225 	{ { 0, 1, 2, 3, 0, 0, 0 }, 4 }, // 001111 = 15 invalid
    226 	{ { 4, 0, 0, 0, 0, 0, 0 }, 1 }, // 010000 = 16
    227 	{ { 0, 4, 0, 0, 0, 0, 0 }, 2 }, // 010001 = 17
    228 	{ { 1, 4, 0, 0, 0, 0, 0 }, 2 }, // 010010 = 18 invalid
    229 	{ { 0, 1, 4, 0, 0, 0, 0 }, 3 }, // 010011 = 19 invalid
    230 	{ { 2, 4, 0, 0, 0, 0, 0 }, 2 }, // 010100 = 20
    231 	{ { 0, 2, 4, 0, 0, 0, 0 }, 3 }, // 010101 = 21
    232 	{ { 1, 2, 4, 0, 0, 0, 0 }, 3 }, // 010110 = 22 invalid
    233 	{ { 0, 1, 2, 4, 0, 0, 0 }, 4 }, // 010111 = 23 invalid
    234 	{ { 3, 4, 0, 0, 0, 0, 0 }, 2 }, // 011000 = 24
    235 	{ { 0, 3, 4, 0, 0, 0, 0 }, 3 }, // 011001 = 25 invalid
    236 	{ { 1, 3, 4, 0, 0, 0, 0 }, 3 }, // 011010 = 26 invalid
    237 	{ { 0, 1, 3, 4, 0, 0, 0 }, 4 }, // 011011 = 27 invalid
    238 	{ { 2, 3, 4, 0, 0, 0, 0 }, 3 }, // 011100 = 28
    239 	{ { 0, 2, 3, 4, 0, 0, 0 }, 4 }, // 011101 = 29 invalid
    240 	{ { 1, 2, 3, 4, 0, 0, 0 }, 4 }, // 011110 = 30 invalid
    241 	{ { 0, 1, 2, 3, 4, 0, 0 }, 5 }, // 011111 = 31 invalid
    242 	{ { 5, 0, 0, 0, 0, 0, 0 }, 1 }, // 100000 = 32
    243 	{ { 0, 5, 0, 0, 0, 0, 0 }, 2 }, // 100001 = 33
    244 	{ { 1, 5, 0, 0, 0, 0, 0 }, 2 }, // 100010 = 34
    245 	{ { 0, 1, 5, 0, 0, 0, 0 }, 3 }, // 100011 = 35
    246 	{ { 2, 5, 0, 0, 0, 0, 0 }, 2 }, // 100100 = 36 invalid
    247 	{ { 0, 2, 5, 0, 0, 0, 0 }, 3 }, // 100101 = 37 invalid
    248 	{ { 1, 2, 5, 0, 0, 0, 0 }, 3 }, // 100110 = 38 invalid
    249 	{ { 0, 1, 2, 5, 0, 0, 0 }, 4 }, // 100111 = 39 invalid
    250 	{ { 3, 5, 0, 0, 0, 0, 0 }, 2 }, // 101000 = 40
    251 	{ { 0, 3, 5, 0, 0, 0, 0 }, 3 }, // 101001 = 41 invalid
    252 	{ { 1, 3, 5, 0, 0, 0, 0 }, 3 }, // 101010 = 42
    253 	{ { 0, 1, 3, 5, 0, 0, 0 }, 4 }, // 101011 = 43 invalid
    254 	{ { 2, 3, 5, 0, 0, 0, 0 }, 3 }, // 101100 = 44 invalid
    255 	{ { 0, 2, 3, 5, 0, 0, 0 }, 4 }, // 101101 = 45 invalid
    256 	{ { 1, 2, 3, 5, 0, 0, 0 }, 4 }, // 101110 = 46 invalid
    257 	{ { 0, 1, 2, 3, 5, 0, 0 }, 5 }, // 101111 = 47 invalid
    258 	{ { 4, 5, 0, 0, 0, 0, 0 }, 2 }, // 110000 = 48
    259 	{ { 0, 4, 5, 0, 0, 0, 0 }, 3 }, // 110001 = 49
    260 	{ { 1, 4, 5, 0, 0, 0, 0 }, 3 }, // 110010 = 50 invalid
    261 	{ { 0, 1, 4, 5, 0, 0, 0 }, 4 }, // 110011 = 51 invalid
    262 	{ { 2, 4, 5, 0, 0, 0, 0 }, 3 }, // 110100 = 52 invalid
    263 	{ { 0, 2, 4, 5, 0, 0, 0 }, 4 }, // 110101 = 53 invalid
    264 	{ { 1, 2, 4, 5, 0, 0, 0 }, 4 }, // 110110 = 54 invalid
    265 	{ { 0, 1, 2, 4, 5, 0, 0 }, 5 }, // 110111 = 55 invalid
    266 	{ { 3, 4, 5, 0, 0, 0, 0 }, 3 }, // 111000 = 56
    267 	{ { 0, 3, 4, 5, 0, 0, 0 }, 4 }, // 111001 = 57 invalid
    268 	{ { 1, 3, 4, 5, 0, 0, 0 }, 4 }, // 111010 = 58 invalid
    269 	{ { 0, 1, 3, 4, 5, 0, 0 }, 5 }, // 111011 = 59 invalid
    270 	{ { 2, 3, 4, 5, 0, 0, 0 }, 4 }, // 111100 = 60 invalid
    271 	{ { 0, 2, 3, 4, 5, 0, 0 }, 5 }, // 111101 = 61 invalid
    272 	{ { 1, 2, 3, 4, 5, 0, 0 }, 5 }, // 111110 = 62 invalid
    273 	{ { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 111111 = 63 invalid
    274 };
    275 
    276 /*
    277 #include <Windows.h>
    278 
    279 class idCreateBoxSilhouetteEdgesForFrontBits {
    280 public:
    281 	idCreateBoxSilhouetteEdgesForFrontBits() {
    282 		for ( int i = 0; i < 64; i++ ) {
    283 			int silhouetteEdges[12] = { 0 };
    284 			int numSilhouetteEdges = 0;
    285 
    286 			for ( int j = 0; j < 12; j++ ) {
    287 				if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
    288 					silhouetteEdges[numSilhouetteEdges++] = j;
    289 				}
    290 			}
    291 
    292 			char bits[7] = { 0 };
    293 			for ( int j = 0; j < 6; j++ ) {
    294 				if ( ( i & ( 1 << j ) ) != 0 ) {
    295 					bits[5 - j] = '1';
    296 				} else {
    297 					bits[5 - j] = '0';
    298 				}
    299 			}
    300 			const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
    301 			if ( i == 0 ) {
    302 				comment = " inside the box, every edge is considered part of the silhouette";
    303 			}
    304 			char buffer[1024];
    305 			sprintf( buffer, "{ { %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d }, %2d }, // %s = %d%s\n",
    306 								silhouetteEdges[0], silhouetteEdges[1], silhouetteEdges[2], silhouetteEdges[3],
    307 								silhouetteEdges[4], silhouetteEdges[5], silhouetteEdges[6], silhouetteEdges[7],
    308 								silhouetteEdges[8], silhouetteEdges[9], silhouetteEdges[10], silhouetteEdges[11],
    309 								numSilhouetteEdges, bits, i, comment );
    310 			OutputDebugString( buffer );
    311 		}
    312 	}
    313 } createBoxSilhouetteEdgesForFrontBits;
    314 */
    315 
    316 // make sure this is a power of two for fast addressing an array of these without integer multiplication
    // Lookup table indexed by the 6-bit front-bits mask documented for
    // GetBoxFrontBits. indices[0..count-1] lists the silhouette edges as row
    // indices into boxEdgeVertices (0 = A ... 11 = L); the rest is zero padding.
    // Per the generator above, an edge is on the silhouette when exactly one of
    // its two faces in boxEdgePolygons is front facing; for mask 0 (viewer inside
    // the box) all twelve edges are listed.
    // The struct is 12 index bytes plus a 4-byte count (16 bytes, assuming byte
    // is a 1-byte type).
    317 static const struct silhouetteEdges_t {
    318 	byte	indices[12];
    319 	int32	count;
    320 } boxSilhouetteEdgesForFrontBits[64] = {
    321 	{ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11 }, 12 }, // 000000 = 0 inside the box, every edge is considered part of the silhouette
    322 	{ {  3,  7,  8, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000001 = 1
    323 	{ {  0,  4,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000010 = 2
    324 	{ {  0,  3,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000011 = 3
    325 	{ {  0,  1,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000100 = 4
    326 	{ {  0,  1,  2,  7,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000101 = 5
    327 	{ {  1,  2,  3,  4,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 000110 = 6
    328 	{ {  1,  2,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000111 = 7
    329 	{ {  1,  5,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 001000 = 8
    330 	{ {  1,  3,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 001001 = 9 invalid
    331 	{ {  0,  1,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001010 = 10
    332 	{ {  0,  1,  3,  4,  5,  7, 10, 11,  0,  0,  0,  0 },  8 }, // 001011 = 11 invalid
    333 	{ {  0,  2,  3,  5,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001100 = 12
    334 	{ {  0,  2,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 001101 = 13 invalid
    335 	{ {  2,  3,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001110 = 14
    336 	{ {  2,  4,  5,  7, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 001111 = 15 invalid
    337 	{ {  2,  6, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 010000 = 16
    338 	{ {  2,  3,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010001 = 17
    339 	{ {  0,  2,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 010010 = 18 invalid
    340 	{ {  0,  2,  3,  4,  6,  7,  9, 10,  0,  0,  0,  0 },  8 }, // 010011 = 19 invalid
    341 	{ {  0,  1,  3,  6, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 010100 = 20
    342 	{ {  0,  1,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010101 = 21
    343 	{ {  1,  3,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 010110 = 22 invalid
    344 	{ {  1,  4,  6,  7,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010111 = 23 invalid
    345 	{ {  1,  2,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011000 = 24
    346 	{ {  1,  2,  3,  5,  6,  7,  8,  9,  0,  0,  0,  0 },  8 }, // 011001 = 25 invalid
    347 	{ {  0,  1,  2,  4,  5,  6,  8, 11,  0,  0,  0,  0 },  8 }, // 011010 = 26 invalid
    348 	{ {  0,  1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0 },  8 }, // 011011 = 27 invalid
    349 	{ {  0,  3,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011100 = 28
    350 	{ {  0,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 011101 = 29 invalid
    351 	{ {  3,  4,  5,  6,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011110 = 30 invalid
    352 	{ {  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 011111 = 31 invalid
    353 	{ {  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 100000 = 32
    354 	{ {  3,  4,  5,  6,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100001 = 33
    355 	{ {  0,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 100010 = 34
    356 	{ {  0,  3,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100011 = 35
    357 	{ {  0,  1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0 },  8 }, // 100100 = 36 invalid
    358 	{ {  0,  1,  2,  4,  5,  6,  8, 11,  0,  0,  0,  0 },  8 }, // 100101 = 37 invalid
    359 	{ {  1,  2,  3,  5,  6,  7,  8,  9,  0,  0,  0,  0 },  8 }, // 100110 = 38 invalid
    360 	{ {  1,  2,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100111 = 39 invalid
    361 	{ {  1,  4,  6,  7,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101000 = 40
    362 	{ {  1,  3,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 101001 = 41 invalid
    363 	{ {  0,  1,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101010 = 42
    364 	{ {  0,  1,  3,  6, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 101011 = 43 invalid
    365 	{ {  0,  2,  3,  4,  6,  7,  9, 10,  0,  0,  0,  0 },  8 }, // 101100 = 44 invalid
    366 	{ {  0,  2,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 101101 = 45 invalid
    367 	{ {  2,  3,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101110 = 46 invalid
    368 	{ {  2,  6, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 101111 = 47 invalid
    369 	{ {  2,  4,  5,  7, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 110000 = 48
    370 	{ {  2,  3,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110001 = 49
    371 	{ {  0,  2,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 110010 = 50 invalid
    372 	{ {  0,  2,  3,  5,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110011 = 51 invalid
    373 	{ {  0,  1,  3,  4,  5,  7, 10, 11,  0,  0,  0,  0 },  8 }, // 110100 = 52 invalid
    374 	{ {  0,  1,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110101 = 53 invalid
    375 	{ {  1,  3,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 110110 = 54 invalid
    376 	{ {  1,  5,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 110111 = 55 invalid
    377 	{ {  1,  2,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111000 = 56
    378 	{ {  1,  2,  3,  4,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 111001 = 57 invalid
    379 	{ {  0,  1,  2,  7,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111010 = 58 invalid
    380 	{ {  0,  1,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111011 = 59 invalid
    381 	{ {  0,  3,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111100 = 60 invalid
    382 	{ {  0,  4,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111101 = 61 invalid
    383 	{ {  3,  7,  8, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111110 = 62 invalid
    384 	{ {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },  0 }, // 111111 = 63 invalid
    385 };
    386 
    387 /*
    388 #include <Windows.h>
    389 
    390 class idCreateBoxSilhouetteVerticesForFrontBits {
    391 public:
    392 	idCreateBoxSilhouetteVerticesForFrontBits() {
    393 		for ( int i = 0; i < 64; i++ ) {
    394 			int silhouetteEdges[12] = { 0 };
    395 			int numSilhouetteEdges = 0;
    396 
    397 			for ( int j = 0; j < 12; j++ ) {
    398 				if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
    399 					silhouetteEdges[numSilhouetteEdges++] = j;
    400 				}
    401 			}
    402 
    403 			int silhouetteVertices[8] = { 0 };
    404 			int numSilhouetteVertices = 0;
    405 
    406 			int vertex = boxEdgeVertices[silhouetteEdges[0]][0];
    407 			for ( int j = 0; j < 7; j++ ) {
    408 				int newVertex = -1;
    409 				for ( int j = 0; j < numSilhouetteEdges; j++ ) {
    410 					if ( silhouetteEdges[j] == -1 ) {
    411 						continue;
    412 					}
    413 					if ( boxEdgeVertices[silhouetteEdges[j]][0] == vertex ) {
    414 						newVertex = boxEdgeVertices[silhouetteEdges[j]][1];
    415 						silhouetteEdges[j] = -1;
    416 						break;
    417 					} else if ( boxEdgeVertices[silhouetteEdges[j]][1] == vertex ) {
    418 						newVertex = boxEdgeVertices[silhouetteEdges[j]][0];
    419 						silhouetteEdges[j] = -1;
    420 						break;
    421 					}
    422 				}
    423 				if ( newVertex == -1 ) {
    424 					break;
    425 				}
    426 				silhouetteVertices[numSilhouetteVertices++] = newVertex;
    427 				vertex = newVertex;
    428 			}
    429 
    430 			char bits[7] = { 0 };
    431 			for ( int j = 0; j < 6; j++ ) {
    432 				if ( ( i & ( 1 << j ) ) != 0 ) {
    433 					bits[5 - j] = '1';
    434 				} else {
    435 					bits[5 - j] = '0';
    436 				}
    437 			}
    438 			const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
    439 			if ( i == 0 ) {
    440 				comment = " inside the box, no silhouette";
    441 			}
    442 			char buffer[1024];
    443 			sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n",
    444 								silhouetteVertices[0], silhouetteVertices[1], silhouetteVertices[2], silhouetteVertices[3], 
    445 								silhouetteVertices[4], silhouetteVertices[5], silhouetteVertices[6], numSilhouetteVertices, bits, i, comment );
    446 			OutputDebugString( buffer );
    447 		}
    448 	}
    449 } createBoxSilhouetteVerticesForFrontBits;
    450 */
    451 
    452 // make sure this is a power of two for fast addressing an array of these without integer multiplication
    // Lookup table indexed by the 6-bit front-bits mask documented for
    // GetBoxFrontBits. indices[0..count-1] lists box vertex numbers (see the box
    // diagram) in the order the generator above visits them while walking from
    // one silhouette edge to the next; the remaining slots are zero padding.
    // Vertex 0 is a real vertex, so always use count to tell data from padding.
    // The struct is 7 index bytes plus 1 count byte (8 bytes, assuming byte is a
    // 1-byte type).
    // NOTE(review): entry 0 is commented as having no silhouette, yet it stores
    // the 7-vertex walk the generator produces when every edge is included --
    // confirm how callers treat a front-bits mask of 0.
    453 static const struct silhouetteVertices_t {
    454 	byte	indices[7];
    455 	byte	count;
    456 } boxSilhouetteVerticesForFrontBits[64] = {
    457 	{ { 1, 2, 3, 0, 4, 5, 6 }, 7 }, // 000000 = 0 inside the box, no vertex is considered part of the silhouette
    458 	{ { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 000001 = 1
    459 	{ { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 000010 = 2
    460 	{ { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 000011 = 3
    461 	{ { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 000100 = 4
    462 	{ { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 000101 = 5
    463 	{ { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 000110 = 6
    464 	{ { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 000111 = 7
    465 	{ { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001000 = 8
    466 	{ { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001001 = 9 invalid
    467 	{ { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 001010 = 10
    468 	{ { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 001011 = 11 invalid
    469 	{ { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 001100 = 12
    470 	{ { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 001101 = 13 invalid
    471 	{ { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 001110 = 14
    472 	{ { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 001111 = 15 invalid
    473 	{ { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 010000 = 16
    474 	{ { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 010001 = 17
    475 	{ { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 010010 = 18 invalid
    476 	{ { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 010011 = 19 invalid
    477 	{ { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 010100 = 20
    478 	{ { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 010101 = 21
    479 	{ { 2, 6, 7, 3, 0, 4, 5 }, 7 }, // 010110 = 22 invalid
    480 	{ { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 010111 = 23 invalid
    481 	{ { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 011000 = 24
    482 	{ { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 011001 = 25 invalid
    483 	{ { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 011010 = 26 invalid
    484 	{ { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 011011 = 27 invalid
    485 	{ { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 011100 = 28
    486 	{ { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 011101 = 29 invalid
    487 	{ { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 011110 = 30 invalid
    488 	{ { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 011111 = 31 invalid
    489 	{ { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 100000 = 32
    490 	{ { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 100001 = 33
    491 	{ { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 100010 = 34
    492 	{ { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 100011 = 35
    493 	{ { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 100100 = 36 invalid
    494 	{ { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 100101 = 37 invalid
    495 	{ { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 100110 = 38 invalid
    496 	{ { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 100111 = 39 invalid
    497 	{ { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 101000 = 40
    498 	{ { 2, 6, 7, 3, 0, 4, 5 }, 7 }, // 101001 = 41 invalid
    499 	{ { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 101010 = 42
    500 	{ { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 101011 = 43 invalid
    501 	{ { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 101100 = 44 invalid
    502 	{ { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 101101 = 45 invalid
    503 	{ { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 101110 = 46 invalid
    504 	{ { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 101111 = 47 invalid
    505 	{ { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 110000 = 48
    506 	{ { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 110001 = 49
    507 	{ { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 110010 = 50 invalid
    508 	{ { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 110011 = 51 invalid
    509 	{ { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 110100 = 52 invalid
    510 	{ { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 110101 = 53 invalid
    511 	{ { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110110 = 54 invalid
    512 	{ { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110111 = 55 invalid
    513 	{ { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 111000 = 56
    514 	{ { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 111001 = 57 invalid
    515 	{ { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 111010 = 58 invalid
    516 	{ { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 111011 = 59 invalid
    517 	{ { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 111100 = 60 invalid
    518 	{ { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 111101 = 61 invalid
    519 	{ { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 111110 = 62 invalid
    520 	{ { 0, 0, 0, 0, 0, 0, 0 }, 0 }, // 111111 = 63 invalid
    521 };
    522 
    523 /*
    524 ========================
    525 GetBoxFrontBits
    526 
    527 front bits:
    528   bit 0 = neg-X is front facing
    529   bit 1 = neg-Y is front facing
    530   bit 2 = neg-Z is front facing
    531   bit 3 = pos-X is front facing
    532   bit 4 = pos-Y is front facing
    533   bit 5 = pos-Z is front facing
    534 ========================
    535 */
    536 #ifdef ID_WIN_X86_SSE2_INTRIN
    537 
    // SSE2 variant of GetBoxFrontBits.
    //
    // b0 / b1 hold the box minimum / maximum corner and viewOrigin the viewer
    // position, each with x, y, z in lanes 0..2. Lane 3 (w) is ignored.
    //
    // Returns a 6-bit mask in the range [0, 63]: bits 0..2 are set where the
    // viewer is below the box minimum on x, y, z (neg-X/Y/Z face is front facing)
    // and bits 3..5 where the viewer is above the box maximum (pos-X/Y/Z face).
    static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m128 & viewOrigin ) {
    	const __m128 zero = _mm_setzero_ps();

    	// a lane compares true when the viewer lies outside the box on that side
    	const __m128 belowMin = _mm_cmplt_ps( _mm_sub_ps( viewOrigin, b0 ), zero );
    	const __m128 aboveMax = _mm_cmplt_ps( _mm_sub_ps( b1, viewOrigin ), zero );

    	// _mm_movemask_ps also reports lane 3; mask it off so a stray w component
    	// can neither alias the pos-X bit nor push the result past the 64-entry
    	// lookup tables that are indexed with these bits
    	const int negBits = _mm_movemask_ps( belowMin ) & 7;
    	const int posBits = _mm_movemask_ps( aboveMax ) & 7;

    	return negBits | ( posBits << 3 );
    }
    547 
    548 #else
    549 
    550 static int GetBoxFrontBits_Generic( const idBounds & bounds, const idVec3 & viewOrigin ) {
    551 	idVec3 dir0 = viewOrigin - bounds[0];
    552 	idVec3 dir1 = bounds[1] - viewOrigin;
    553 	int frontBits = 0;
    554 	frontBits |= IEEE_FLT_SIGNBITSET( dir0.x ) << 0;
    555 	frontBits |= IEEE_FLT_SIGNBITSET( dir0.y ) << 1;
    556 	frontBits |= IEEE_FLT_SIGNBITSET( dir0.z ) << 2;
    557 	frontBits |= IEEE_FLT_SIGNBITSET( dir1.x ) << 3;
    558 	frontBits |= IEEE_FLT_SIGNBITSET( dir1.y ) << 4;
    559 	frontBits |= IEEE_FLT_SIGNBITSET( dir1.z ) << 5;
    560 	return frontBits;
    561 }
    562 
    563 #endif
    564 
    565 /*
    566 ================================================================================================
    567 
    568 idRenderMatrix implementation
    569 
    570 ================================================================================================
    571 */
    572 
    573 /*
    574 ========================
    575 idRenderMatrix::CreateFromOriginAxis
    576 ========================
    577 */
    578 void idRenderMatrix::CreateFromOriginAxis( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) {
    579 	out[0][0] = axis[0][0];
    580 	out[0][1] = axis[1][0];
    581 	out[0][2] = axis[2][0];
    582 	out[0][3] = origin[0];
    583 
    584 	out[1][0] = axis[0][1];
    585 	out[1][1] = axis[1][1];
    586 	out[1][2] = axis[2][1];
    587 	out[1][3] = origin[1];
    588 
    589 	out[2][0] = axis[0][2];
    590 	out[2][1] = axis[1][2];
    591 	out[2][2] = axis[2][2];
    592 	out[2][3] = origin[2];
    593 
    594 	out[3][0] = 0.0f;
    595 	out[3][1] = 0.0f;
    596 	out[3][2] = 0.0f;
    597 	out[3][3] = 1.0f;
    598 }
    599 
    600 /*
    601 ========================
    602 idRenderMatrix::CreateFromOriginAxisScale
    603 ========================
    604 */
    605 void idRenderMatrix::CreateFromOriginAxisScale( const idVec3 & origin, const idMat3 & axis, const idVec3 & scale, idRenderMatrix & out ) {
    606 	out[0][0] = axis[0][0] * scale[0];
    607 	out[0][1] = axis[1][0] * scale[1];
    608 	out[0][2] = axis[2][0] * scale[2];
    609 	out[0][3] = origin[0];
    610 
    611 	out[1][0] = axis[0][1] * scale[0];
    612 	out[1][1] = axis[1][1] * scale[1];
    613 	out[1][2] = axis[2][1] * scale[2];
    614 	out[1][3] = origin[1];
    615 
    616 	out[2][0] = axis[0][2] * scale[0];
    617 	out[2][1] = axis[1][2] * scale[1];
    618 	out[2][2] = axis[2][2] * scale[2];
    619 	out[2][3] = origin[2];
    620 
    621 	out[3][0] = 0.0f;
    622 	out[3][1] = 0.0f;
    623 	out[3][2] = 0.0f;
    624 	out[3][3] = 1.0f;
    625 }
    626 
    627 /*
    628 ========================
    629 idRenderMatrix::CreateViewMatrix
    630 
    631 Our axis looks down positive +X, render matrix looks down -Z.
    632 ========================
    633 */
    634 void idRenderMatrix::CreateViewMatrix( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) {
    635 	out[0][0] = -axis[1][0];
    636 	out[0][1] = -axis[1][1];
    637 	out[0][2] = -axis[1][2];
    638 	out[0][3] = origin[0] * axis[1][0] + origin[1] * axis[1][1] + origin[2] * axis[1][2];
    639 
    640 	out[1][0] = axis[2][0];
    641 	out[1][1] = axis[2][1];
    642 	out[1][2] = axis[2][2];
    643 	out[1][3] = -( origin[0] * axis[2][0] + origin[1] * axis[2][1] + origin[2] * axis[2][2] );
    644 
    645 	out[2][0] = -axis[0][0];
    646 	out[2][1] = -axis[0][1];
    647 	out[2][2] = -axis[0][2];
    648 	out[2][3] = origin[0] * axis[0][0] + origin[1] * axis[0][1] + origin[2] * axis[0][2];
    649 
    650 	out[3][0] = 0.0f;
    651 	out[3][1] = 0.0f;
    652 	out[3][2] = 0.0f;
    653 	out[3][3] = 1.0f;
    654 }
    655 
    656 /*
    657 ========================
    658 idRenderMatrix::CreateProjectionMatrix
    659 
    660 If zFar == 0, an infinite far plane will be used.
    661 ========================
    662 */
    663 void idRenderMatrix::CreateProjectionMatrix( float xMin, float xMax, float yMin, float yMax, float zNear, float zFar, idRenderMatrix & out ) {
    664 	const float width  = xMax - xMin;
    665 	const float height = yMax - yMin;
    666 
    667 	out[0][0] = 2.0f * zNear / width;
    668 	out[0][1] = 0.0f;
    669 	out[0][2] = ( xMax + xMin ) / width;	// normally 0	
    670 	out[0][3] = 0.0f;
    671 
    672 	out[1][0] = 0.0f;
    673 	out[1][1] = 2.0f * zNear / height;
    674 	out[1][2] = ( yMax + yMin ) / height;	// normally 0	
    675 	out[1][3] = 0.0f;
    676 
    677 	if ( zFar <= zNear ) {
    678 		// this is the far-plane-at-infinity formulation
    679 		out[2][0] = 0.0f;
    680 		out[2][1] = 0.0f;
    681 		out[2][2] = -1.0f;
    682 #if defined( CLIP_SPACE_D3D )
    683 		// the D3D clip space Z is in range [0,1] instead of [-1,1]
    684 		out[2][3] = -zNear;
    685 #else
    686 		out[2][3] = -2.0f * zNear;
    687 #endif
    688 	} else {
    689 		out[2][0] = 0.0f;
    690 		out[2][1] = 0.0f;
    691 #if defined( CLIP_SPACE_D3D )
    692 		// the D3D clip space Z is in range [0,1] instead of [-1,1]
    693 		out[2][2] = -( zFar ) / ( zFar - zNear );
    694 		out[2][3] = -( zFar * zNear ) / ( zFar - zNear );
    695 #else
    696 		out[2][2] = -( zFar + zNear ) / ( zFar - zNear );
    697 		out[2][3] = -( 2.0f * zFar * zNear ) / ( zFar - zNear );
    698 #endif
    699 	}
    700 
    701 	out[3][0] = 0.0f;
    702 	out[3][1] = 0.0f;
    703 	out[3][2] = -1.0f;
    704 	out[3][3] = 0.0f;
    705 }
    706 
    707 /*
    708 ========================
    709 idRenderMatrix::CreateProjectionMatrixFov
    710 
    711 xOffset and yOffset should be in the -1 to 1 range for sub-pixel accumulation jitter.
    712 xOffset can also be used for eye separation when rendering stereo.
    713 ========================
    714 */
    715 void idRenderMatrix::CreateProjectionMatrixFov( float xFovDegrees, float yFovDegrees, float zNear, float zFar, float xOffset, float yOffset, idRenderMatrix & out ) {
    716 	float xMax = zNear * idMath::Tan( DEG2RAD( xFovDegrees ) * 0.5f );
    717 	float xMin = -xMax;
    718 
    719 	float yMax = zNear * idMath::Tan( DEG2RAD( yFovDegrees ) * 0.5f );
    720 	float yMin = -yMax;
    721 
    722 	xMin += xOffset;
    723 	xMax += xOffset;
    724 
    725 	yMin += yOffset;
    726 	yMax += yOffset;
    727 
    728 	CreateProjectionMatrix( xMin, xMax, yMin, yMax, zNear, zFar, out );
    729 }
    730 
    731 /*
    732 ========================
    733 idRenderMatrix::OffsetScaleForBounds
    734 
    735 Add the offset to the center of the bounds and scale for the width of the bounds.
    736 The result matrix will transform the unit-cube to exactly cover the bounds.
    737 ========================
    738 */
    739 void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
    740 	assert( &src != &out );
    741 
    742 #ifdef ID_WIN_X86_SSE2_INTRIN
    743 
    744 	__m128 b0 = _mm_loadu_bounds_0( bounds );
    745 	__m128 b1 = _mm_loadu_bounds_1( bounds );
    746 
    747 	__m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half );
    748 	__m128 scale  = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half );
    749 
    750 	scale = _mm_or_ps( scale, vector_float_last_one );
    751 
    752 	__m128 a0 = _mm_loadu_ps( src.m + 0*4 );
    753 	__m128 a1 = _mm_loadu_ps( src.m + 1*4 );
    754 	__m128 a2 = _mm_loadu_ps( src.m + 2*4 );
    755 	__m128 a3 = _mm_loadu_ps( src.m + 3*4 );
    756 
    757 	__m128 d0 = _mm_mul_ps( a0, offset );
    758 	__m128 d1 = _mm_mul_ps( a1, offset );
    759 	__m128 d2 = _mm_mul_ps( a2, offset );
    760 	__m128 d3 = _mm_mul_ps( a3, offset );
    761 
    762 	__m128 s0 = _mm_unpacklo_ps( d0, d2 );		// a0, c0, a1, c1
    763 	__m128 s1 = _mm_unpackhi_ps( d0, d2 );		// a2, c2, a3, c3
    764 	__m128 s2 = _mm_unpacklo_ps( d1, d3 );		// b0, d0, b1, d1
    765 	__m128 s3 = _mm_unpackhi_ps( d1, d3 );		// b2, d2, b3, d3
    766 
    767 	__m128 t0 = _mm_unpacklo_ps( s0, s2 );		// a0, b0, c0, d0
    768 	__m128 t1 = _mm_unpackhi_ps( s0, s2 );		// a1, b1, c1, d1
    769 	__m128 t2 = _mm_unpacklo_ps( s1, s3 );		// a2, b2, c2, d2
    770 
    771 	t0 = _mm_add_ps( t0, t1 );
    772 	t0 = _mm_add_ps( t0, t2 );
    773 
    774 	__m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last );
    775 	__m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last );
    776 	__m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last );
    777 	__m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last );
    778 
    779 	a0 = _mm_madd_ps( a0, scale, n0 );
    780 	a1 = _mm_madd_ps( a1, scale, n1 );
    781 	a2 = _mm_madd_ps( a2, scale, n2 );
    782 	a3 = _mm_madd_ps( a3, scale, n3 );
    783 
    784 	_mm_storeu_ps( out.m + 0*4, a0 );
    785 	_mm_storeu_ps( out.m + 1*4, a1 );
    786 	_mm_storeu_ps( out.m + 2*4, a2 );
    787 	_mm_storeu_ps( out.m + 3*4, a3 );
    788 
    789 #else
    790 
    791 	const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
    792 	const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
    793 
    794 	out[0][0] = src[0][0] * scale[0];
    795 	out[0][1] = src[0][1] * scale[1];
    796 	out[0][2] = src[0][2] * scale[2];
    797 	out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2];
    798 
    799 	out[1][0] = src[1][0] * scale[0];
    800 	out[1][1] = src[1][1] * scale[1];
    801 	out[1][2] = src[1][2] * scale[2];
    802 	out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2];
    803 
    804 	out[2][0] = src[2][0] * scale[0];
    805 	out[2][1] = src[2][1] * scale[1];
    806 	out[2][2] = src[2][2] * scale[2];
    807 	out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2];
    808 
    809 	out[3][0] = src[3][0] * scale[0];
    810 	out[3][1] = src[3][1] * scale[1];
    811 	out[3][2] = src[3][2] * scale[2];
    812 	out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2];
    813 
    814 #endif
    815 }
    816 
    817 /*
    818 ========================
    819 idRenderMatrix::InverseOffsetScaleForBounds
    820 
    821 Subtract the offset to the center of the bounds and inverse scale for the width of the bounds.
    822 The result matrix will transform the bounds to exactly cover the unit-cube.
    823 ========================
    824 */
    825 void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
    826 	assert( &src != &out );
    827 
    828 #ifdef ID_WIN_X86_SSE2_INTRIN
    829 
    830 	__m128 b0 = _mm_loadu_bounds_0( bounds );
    831 	__m128 b1 = _mm_loadu_bounds_1( bounds );
    832 
    833 	__m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_neg_half );
    834 	__m128 scale  = _mm_mul_ps( _mm_sub_ps( b0, b1 ), vector_float_neg_half );
    835 
    836 	scale = _mm_max_ps( scale, vector_float_smallest_non_denorm );
    837 
    838 	__m128 rscale = _mm_rcp32_ps( scale );
    839 
    840 	offset = _mm_mul_ps( offset, rscale );
    841 
    842 	__m128 d0 = _mm_and_ps( _mm_splat_ps( offset, 0 ), vector_float_keep_last );
    843 	__m128 d1 = _mm_and_ps( _mm_splat_ps( offset, 1 ), vector_float_keep_last );
    844 	__m128 d2 = _mm_and_ps( _mm_splat_ps( offset, 2 ), vector_float_keep_last );
    845 
    846 	__m128 a0 = _mm_loadu_ps( src.m + 0*4 );
    847 	__m128 a1 = _mm_loadu_ps( src.m + 1*4 );
    848 	__m128 a2 = _mm_loadu_ps( src.m + 2*4 );
    849 	__m128 a3 = _mm_loadu_ps( src.m + 3*4 );
    850 
    851 	a0 = _mm_madd_ps( a0, _mm_splat_ps( rscale, 0 ), d0 );
    852 	a1 = _mm_madd_ps( a1, _mm_splat_ps( rscale, 1 ), d1 );
    853 	a2 = _mm_madd_ps( a2, _mm_splat_ps( rscale, 2 ), d2 );
    854 
    855 	_mm_storeu_ps( out.m + 0*4, a0 );
    856 	_mm_storeu_ps( out.m + 1*4, a1 );
    857 	_mm_storeu_ps( out.m + 2*4, a2 );
    858 	_mm_storeu_ps( out.m + 3*4, a3 );
    859 
    860 #else
    861 
    862 	const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] );
    863 	const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] );
    864 
    865 	out[0][0] = scale[0] * src[0][0];
    866 	out[0][1] = scale[0] * src[0][1];
    867 	out[0][2] = scale[0] * src[0][2];
    868 	out[0][3] = scale[0] * ( src[0][3] + offset[0] );
    869 						
    870 	out[1][0] = scale[1] * src[1][0];
    871 	out[1][1] = scale[1] * src[1][1];
    872 	out[1][2] = scale[1] * src[1][2];
    873 	out[1][3] = scale[1] * ( src[1][3] + offset[1] );
    874 						
    875 	out[2][0] = scale[2] * src[2][0];
    876 	out[2][1] = scale[2] * src[2][1];
    877 	out[2][2] = scale[2] * src[2][2];
    878 	out[2][3] = scale[2] * ( src[2][3] + offset[2] );
    879 
    880 	out[3][0] = src[3][0];
    881 	out[3][1] = src[3][1];
    882 	out[3][2] = src[3][2];
    883 	out[3][3] = src[3][3];
    884 
    885 #endif
    886 }
    887 
    888 /*
    889 ========================
    890 idRenderMatrix::Transpose
    891 ========================
    892 */
    893 void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out ) {
    894 	assert( &src != &out );
    895 
    896 #ifdef ID_WIN_X86_SSE2_INTRIN
    897 
    898 	const __m128 a0 = _mm_loadu_ps( src.m + 0*4 );
    899 	const __m128 a1 = _mm_loadu_ps( src.m + 1*4 );
    900 	const __m128 a2 = _mm_loadu_ps( src.m + 2*4 );
    901 	const __m128 a3 = _mm_loadu_ps( src.m + 3*4 );
    902 
    903 	const __m128 r0 = _mm_unpacklo_ps( a0, a2 );
    904 	const __m128 r1 = _mm_unpackhi_ps( a0, a2 );
    905 	const __m128 r2 = _mm_unpacklo_ps( a1, a3 );
    906 	const __m128 r3 = _mm_unpackhi_ps( a1, a3 );
    907 
    908 	const __m128 t0 = _mm_unpacklo_ps( r0, r2 );
    909 	const __m128 t1 = _mm_unpackhi_ps( r0, r2 );
    910 	const __m128 t2 = _mm_unpacklo_ps( r1, r3 );
    911 	const __m128 t3 = _mm_unpackhi_ps( r1, r3 );
    912 
    913 	_mm_storeu_ps( out.m + 0*4, t0 );
    914 	_mm_storeu_ps( out.m + 1*4, t1 );
    915 	_mm_storeu_ps( out.m + 2*4, t2 );
    916 	_mm_storeu_ps( out.m + 3*4, t3 );
    917 
    918 #else
    919 
    920 	for ( int i = 0; i < 4; i++ ) {
    921 		for ( int j = 0; j < 4; j++ ) {
    922 			out[i][j] = src[j][i];
    923 		}
    924 	}
    925 
    926 #endif
    927 }
    928 
    929 /*
    930 ========================
    931 idRenderMatrix::Multiply
    932 ========================
    933 */
    934 void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & b, idRenderMatrix & out ) {
    935 
    936 #ifdef ID_WIN_X86_SSE2_INTRIN
    937 
    938 	__m128 a0 = _mm_loadu_ps( a.m + 0*4 );
    939 	__m128 a1 = _mm_loadu_ps( a.m + 1*4 );
    940 	__m128 a2 = _mm_loadu_ps( a.m + 2*4 );
    941 	__m128 a3 = _mm_loadu_ps( a.m + 3*4 );
    942 
    943 	__m128 b0 = _mm_loadu_ps( b.m + 0*4 );
    944 	__m128 b1 = _mm_loadu_ps( b.m + 1*4 );
    945 	__m128 b2 = _mm_loadu_ps( b.m + 2*4 );
    946 	__m128 b3 = _mm_loadu_ps( b.m + 3*4 );
    947 
    948 	__m128 t0 = _mm_mul_ps( _mm_splat_ps( a0, 0 ), b0 );
    949 	__m128 t1 = _mm_mul_ps( _mm_splat_ps( a1, 0 ), b0 );
    950 	__m128 t2 = _mm_mul_ps( _mm_splat_ps( a2, 0 ), b0 );
    951 	__m128 t3 = _mm_mul_ps( _mm_splat_ps( a3, 0 ), b0 );
    952 
    953 	t0 = _mm_madd_ps( _mm_splat_ps( a0, 1 ), b1, t0 );
    954 	t1 = _mm_madd_ps( _mm_splat_ps( a1, 1 ), b1, t1 );
    955 	t2 = _mm_madd_ps( _mm_splat_ps( a2, 1 ), b1, t2 );
    956 	t3 = _mm_madd_ps( _mm_splat_ps( a3, 1 ), b1, t3 );
    957 
    958 	t0 = _mm_madd_ps( _mm_splat_ps( a0, 2 ), b2, t0 );
    959 	t1 = _mm_madd_ps( _mm_splat_ps( a1, 2 ), b2, t1 );
    960 	t2 = _mm_madd_ps( _mm_splat_ps( a2, 2 ), b2, t2 );
    961 	t3 = _mm_madd_ps( _mm_splat_ps( a3, 2 ), b2, t3 );
    962 
    963 	t0 = _mm_madd_ps( _mm_splat_ps( a0, 3 ), b3, t0 );
    964 	t1 = _mm_madd_ps( _mm_splat_ps( a1, 3 ), b3, t1 );
    965 	t2 = _mm_madd_ps( _mm_splat_ps( a2, 3 ), b3, t2 );
    966 	t3 = _mm_madd_ps( _mm_splat_ps( a3, 3 ), b3, t3 );
    967 
    968 	_mm_storeu_ps( out.m + 0*4, t0 );
    969 	_mm_storeu_ps( out.m + 1*4, t1 );
    970 	_mm_storeu_ps( out.m + 2*4, t2 );
    971 	_mm_storeu_ps( out.m + 3*4, t3 );
    972 
    973 #else
    974 
    975 	/*
    976 	for ( int i = 0 ; i < 4 ; i++ ) {
    977 		for ( int j = 0 ; j < 4 ; j++ ) {
    978 			out.m[ i * 4 + j ] =
    979 				a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] +
    980 				a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] +
    981 				a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] +
    982 				a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ];
    983 		}
    984 	}
    985 	*/
    986 
    987 	out.m[0*4+0] = a.m[0*4+0]*b.m[0*4+0] + a.m[0*4+1]*b.m[1*4+0] + a.m[0*4+2]*b.m[2*4+0] + a.m[0*4+3]*b.m[3*4+0];
    988 	out.m[0*4+1] = a.m[0*4+0]*b.m[0*4+1] + a.m[0*4+1]*b.m[1*4+1] + a.m[0*4+2]*b.m[2*4+1] + a.m[0*4+3]*b.m[3*4+1];
    989 	out.m[0*4+2] = a.m[0*4+0]*b.m[0*4+2] + a.m[0*4+1]*b.m[1*4+2] + a.m[0*4+2]*b.m[2*4+2] + a.m[0*4+3]*b.m[3*4+2];
    990 	out.m[0*4+3] = a.m[0*4+0]*b.m[0*4+3] + a.m[0*4+1]*b.m[1*4+3] + a.m[0*4+2]*b.m[2*4+3] + a.m[0*4+3]*b.m[3*4+3];
    991 
    992 	out.m[1*4+0] = a.m[1*4+0]*b.m[0*4+0] + a.m[1*4+1]*b.m[1*4+0] + a.m[1*4+2]*b.m[2*4+0] + a.m[1*4+3]*b.m[3*4+0];
    993 	out.m[1*4+1] = a.m[1*4+0]*b.m[0*4+1] + a.m[1*4+1]*b.m[1*4+1] + a.m[1*4+2]*b.m[2*4+1] + a.m[1*4+3]*b.m[3*4+1];
    994 	out.m[1*4+2] = a.m[1*4+0]*b.m[0*4+2] + a.m[1*4+1]*b.m[1*4+2] + a.m[1*4+2]*b.m[2*4+2] + a.m[1*4+3]*b.m[3*4+2];
    995 	out.m[1*4+3] = a.m[1*4+0]*b.m[0*4+3] + a.m[1*4+1]*b.m[1*4+3] + a.m[1*4+2]*b.m[2*4+3] + a.m[1*4+3]*b.m[3*4+3];
    996 
    997 	out.m[2*4+0] = a.m[2*4+0]*b.m[0*4+0] + a.m[2*4+1]*b.m[1*4+0] + a.m[2*4+2]*b.m[2*4+0] + a.m[2*4+3]*b.m[3*4+0];
    998 	out.m[2*4+1] = a.m[2*4+0]*b.m[0*4+1] + a.m[2*4+1]*b.m[1*4+1] + a.m[2*4+2]*b.m[2*4+1] + a.m[2*4+3]*b.m[3*4+1];
    999 	out.m[2*4+2] = a.m[2*4+0]*b.m[0*4+2] + a.m[2*4+1]*b.m[1*4+2] + a.m[2*4+2]*b.m[2*4+2] + a.m[2*4+3]*b.m[3*4+2];
   1000 	out.m[2*4+3] = a.m[2*4+0]*b.m[0*4+3] + a.m[2*4+1]*b.m[1*4+3] + a.m[2*4+2]*b.m[2*4+3] + a.m[2*4+3]*b.m[3*4+3];
   1001 
   1002 	out.m[3*4+0] = a.m[3*4+0]*b.m[0*4+0] + a.m[3*4+1]*b.m[1*4+0] + a.m[3*4+2]*b.m[2*4+0] + a.m[3*4+3]*b.m[3*4+0];
   1003 	out.m[3*4+1] = a.m[3*4+0]*b.m[0*4+1] + a.m[3*4+1]*b.m[1*4+1] + a.m[3*4+2]*b.m[2*4+1] + a.m[3*4+3]*b.m[3*4+1];
   1004 	out.m[3*4+2] = a.m[3*4+0]*b.m[0*4+2] + a.m[3*4+1]*b.m[1*4+2] + a.m[3*4+2]*b.m[2*4+2] + a.m[3*4+3]*b.m[3*4+2];
   1005 	out.m[3*4+3] = a.m[3*4+0]*b.m[0*4+3] + a.m[3*4+1]*b.m[1*4+3] + a.m[3*4+2]*b.m[2*4+3] + a.m[3*4+3]*b.m[3*4+3];
   1006 
   1007 #endif
   1008 }
   1009 
   1010 /*
   1011 ========================
   1012 idRenderMatrix::Inverse
   1013 
   1014 inverse( M ) = ( 1 / determinant( M ) ) * transpose( cofactor( M ) )
   1015 
   1016 This code is based on the code written by Cédric Lallain, published on "Cell Performance"
   1017 (by Mike Acton) and released under the BSD 3-Clause ("BSD New" or "BSD Simplified") license.
   1018 https://code.google.com/p/cellperformance-snippets/
   1019 
   1020 Note that large parallel lights can have very small values in the projection matrix,
   1021 scaling tens of thousands of world units down to a 0-1 range, so the determinants
   1022 can get really, really small.
        
        Returns false, without writing anything to 'out', when the absolute value of the
        determinant is below the inverse epsilon. Returns true after 'out' has been filled in.
        Both code paths finish reading 'src' before the first write to 'out'.
   1023 ========================
   1024 */
   1025 bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) {
   1026 
   1027 #ifdef ID_WIN_X86_SSE2_INTRIN
   1028 
   1029 	const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 );
   1030 	const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 );
   1031 	const __m128 r2 = _mm_loadu_ps( src.m + 2 * 4 );
   1032 	const __m128 r3 = _mm_loadu_ps( src.m + 3 * 4 );
   1033 
   1034 	// rXuY = row X rotated up by Y floats.
   1035 	const __m128 r0u1 = _mm_perm_ps( r0, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1036 	const __m128 r0u2 = _mm_perm_ps( r0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1037 	const __m128 r0u3 = _mm_perm_ps( r0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1038 
   1039 	const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1040 	const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1041 	const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1042 
   1043 	const __m128 r2u1 = _mm_perm_ps( r2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1044 	const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1045 	const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1046 
   1047 	const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1048 	const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1049 	const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1050 
        	// 3x3 determinants built from rows 1, 2 and 3 ( one per lane ): the sum of the three
        	// positive triple products minus the sum of the three negative ones.
        	// Some pair products are obtained by rotating an already computed product instead of multiplying again.
   1051 	const __m128 m_r2u2_r3u3      						= _mm_mul_ps( r2u2, r3u3 );
   1052 	const __m128 m_r1u1_r2u2_r3u3 						= _mm_mul_ps( r1u1, m_r2u2_r3u3 );
   1053 	const __m128 m_r2u3_r3u1							= _mm_mul_ps( r2u3, r3u1 );
   1054 	const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3	= _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 );
   1055 	const __m128 m_r2u1_r3u2							= _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1056 	const __m128 pos_part_det3x3_r0						= _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 );
   1057 	const __m128 m_r2u3_r3u2							= _mm_mul_ps( r2u3, r3u2 );
   1058 	const __m128 m_r1u1_r2u3_r3u2						= _mm_mul_ps( r1u1, m_r2u3_r3u2 );
   1059 	const __m128 m_r2u1_r3u3							= _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1060 	const __m128 a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2	= _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 );
   1061 	const __m128 m_r2u2_r3u1							= _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1062 	const __m128 neg_part_det3x3_r0						= _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 );
   1063 	const __m128 det3x3_r0 								= _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 );
   1064 
        	// same expansion with row 0 in place of row 1 ( rows 0, 2 and 3 ), reusing the row 2 / row 3 pair products
   1065 	const __m128 m_r0u1_r2u2_r3u3 						= _mm_mul_ps( r0u1, m_r2u2_r3u3 );
   1066 	const __m128 a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3	= _mm_madd_ps( r0u2, m_r2u3_r3u1, m_r0u1_r2u2_r3u3 );
   1067 	const __m128 pos_part_det3x3_r1						= _mm_madd_ps( r0u3, m_r2u1_r3u2, a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3 );
   1068 	const __m128 m_r0u1_r2u3_r3u2						= _mm_mul_ps( r0u1, m_r2u3_r3u2 );
   1069 	const __m128 a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2	= _mm_madd_ps( r0u2, m_r2u1_r3u3, m_r0u1_r2u3_r3u2 );
   1070 	const __m128 neg_part_det3x3_r1						= _mm_madd_ps( r0u3, m_r2u2_r3u1, a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2 );
   1071 	const __m128 det3x3_r1 								= _mm_sub_ps( pos_part_det3x3_r1, neg_part_det3x3_r1 );
   1072 
        	// 3x3 determinants built from rows 0, 1 and 2, starting from row 0 / row 1 pair products
   1073 	const __m128 m_r0u1_r1u2      						= _mm_mul_ps( r0u1, r1u2 );
   1074 	const __m128 m_r0u1_r1u2_r2u3 						= _mm_mul_ps( m_r0u1_r1u2, r2u3 );
   1075 	const __m128 m_r0u2_r1u3							= _mm_perm_ps( m_r0u1_r1u2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1076 	const __m128 a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3	= _mm_madd_ps( m_r0u2_r1u3, r2u1, m_r0u1_r1u2_r2u3 );
   1077 	const __m128 m_r0u3_r1u1							= _mm_mul_ps( r0u3, r1u1 );
   1078 	const __m128 pos_part_det3x3_r3						= _mm_madd_ps( m_r0u3_r1u1, r2u2, a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3 );
   1079 	const __m128 m_r0u1_r1u3							= _mm_perm_ps( m_r0u3_r1u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1080 	const __m128 m_r0u1_r1u3_r2u2						= _mm_mul_ps( m_r0u1_r1u3, r2u2 );
   1081 	const __m128 m_r0u2_r1u1							= _mm_mul_ps( r0u2, r1u1 );
   1082 	const __m128 a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2	= _mm_madd_ps( m_r0u2_r1u1, r2u3, m_r0u1_r1u3_r2u2 );
   1083 	const __m128 m_r0u3_r1u2							= _mm_perm_ps( m_r0u2_r1u1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1084 	const __m128 neg_part_det3x3_r3						= _mm_madd_ps( m_r0u3_r1u2, r2u1, a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2 );
   1085 	const __m128 det3x3_r3 								= _mm_sub_ps( pos_part_det3x3_r3, neg_part_det3x3_r3 );
   1086 
        	// same expansion with row 3 in place of row 2 ( rows 0, 1 and 3 ), reusing the row 0 / row 1 pair products
   1087 	const __m128 m_r0u1_r1u2_r3u3 						= _mm_mul_ps( m_r0u1_r1u2, r3u3 );
   1088 	const __m128 a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3	= _mm_madd_ps( m_r0u2_r1u3, r3u1, m_r0u1_r1u2_r3u3 );
   1089 	const __m128 pos_part_det3x3_r2						= _mm_madd_ps( m_r0u3_r1u1, r3u2, a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3 );
   1090 	const __m128 m_r0u1_r1u3_r3u2						= _mm_mul_ps( m_r0u1_r1u3, r3u2 );
   1091 	const __m128 a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2	= _mm_madd_ps( m_r0u2_r1u1, r3u3, m_r0u1_r1u3_r3u2 );
   1092 	const __m128 neg_part_det3x3_r2						= _mm_madd_ps( m_r0u3_r1u2, r3u1, a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2 );
   1093 	const __m128 det3x3_r2 								= _mm_sub_ps( pos_part_det3x3_r2, neg_part_det3x3_r2 );
   1094 
        	// c_mask is all ones ( 0 == 0 in every lane ); shifting it left by 31 leaves only the sign bit.
        	// Interleaving the sign mask with zero gives the two alternating patterns
        	// ( 0, sign, 0, sign ) and ( sign, 0, sign, 0 ).
   1095 	const __m128 c_zero		= _mm_setzero_ps();
   1096 	const __m128 c_mask		= _mm_cmpeq_ps( c_zero, c_zero );
   1097 	const __m128 c_signmask	= _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) );
   1098 	const __m128 c_znzn		= _mm_unpacklo_ps( c_zero, c_signmask );
   1099 	const __m128 c_nznz		= _mm_unpacklo_ps( c_signmask, c_zero );
   1100 
        	// XOR with the alternating sign masks negates every other lane, turning the
        	// 3x3 determinants into signed cofactors.
   1101 	const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn );
   1102 	const __m128 cofactor_r1 = _mm_xor_ps( det3x3_r1, c_nznz );
   1103 	const __m128 cofactor_r2 = _mm_xor_ps( det3x3_r2, c_znzn );
   1104 	const __m128 cofactor_r3 = _mm_xor_ps( det3x3_r3, c_nznz );
   1105 
        	// determinant = row 0 dotted with its cofactors; adding the products rotated by one
        	// and then by two floats leaves the full sum in every lane
   1106 	const __m128 dot0	= _mm_mul_ps( r0, cofactor_r0 );
   1107 	const __m128 dot1 	= _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) );
   1108 	const __m128 det	= _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   1109 
        	// clear the sign bit to get | det | and give up before anything is stored to 'out'
        	// if it is below the inverse epsilon
   1110 	const __m128 absDet	= _mm_andnot_ps( c_signmask, det );
   1111 	if ( _mm_movemask_ps( _mm_cmplt_ps( absDet, vector_float_inverse_epsilon ) ) & 15 ) {
   1112 		return false;
   1113 	}
   1114 
        	// NOTE(review): _mm_rcp32_ps is defined elsewhere; presumably the reciprocal of det -- confirm its precision
   1115 	const __m128 rcpDet	= _mm_rcp32_ps( det );
   1116 
        	// transpose the cofactor rows to form the adjoint
   1117 	const __m128 hi_part_r0_r2 = _mm_unpacklo_ps( cofactor_r0, cofactor_r2 );
   1118 	const __m128 lo_part_r0_r2 = _mm_unpackhi_ps( cofactor_r0, cofactor_r2 );
   1119 	const __m128 hi_part_r1_r3 = _mm_unpacklo_ps( cofactor_r1, cofactor_r3 );
   1120 	const __m128 lo_part_r1_r3 = _mm_unpackhi_ps( cofactor_r1, cofactor_r3 );
   1121 
   1122 	const __m128 adjoint_r0    = _mm_unpacklo_ps( hi_part_r0_r2, hi_part_r1_r3 );
   1123 	const __m128 adjoint_r1    = _mm_unpackhi_ps( hi_part_r0_r2, hi_part_r1_r3 );
   1124 	const __m128 adjoint_r2    = _mm_unpacklo_ps( lo_part_r0_r2, lo_part_r1_r3 );
   1125 	const __m128 adjoint_r3    = _mm_unpackhi_ps( lo_part_r0_r2, lo_part_r1_r3 );
   1126 
        	// inverse = adjoint * ( 1 / det )
   1127 	_mm_storeu_ps( out.m + 0 * 4, _mm_mul_ps( adjoint_r0, rcpDet ) );
   1128 	_mm_storeu_ps( out.m + 1 * 4, _mm_mul_ps( adjoint_r1, rcpDet ) );
   1129 	_mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) );
   1130 	_mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) );
   1131 
   1132 #else
   1133 
        	// row length in floats: element ( r, c ) is src.m[ r * FRL + c ]
   1134 	const int FRL = 4;
   1135 
   1136 	// 84+4+16 = 104 multiplications
   1137 	//			   1 division
   1138 
        	// naming: det2_RS_CD is the 2x2 determinant of rows R, S and columns C, D;
        	// det3_RST_CDE is the 3x3 determinant of rows R, S, T and columns C, D, E
   1139 	// 2x2 sub-determinants required to calculate 4x4 determinant
   1140 	const float det2_01_01 = src.m[0*FRL+0] * src.m[1*FRL+1] - src.m[0*FRL+1] * src.m[1*FRL+0];
   1141 	const float det2_01_02 = src.m[0*FRL+0] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+0];
   1142 	const float det2_01_03 = src.m[0*FRL+0] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+0];
   1143 	const float det2_01_12 = src.m[0*FRL+1] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+1];
   1144 	const float det2_01_13 = src.m[0*FRL+1] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+1];
   1145 	const float det2_01_23 = src.m[0*FRL+2] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+2];
   1146 
   1147 	// 3x3 sub-determinants required to calculate 4x4 determinant
   1148 	const float det3_201_012 = src.m[2*FRL+0] * det2_01_12 - src.m[2*FRL+1] * det2_01_02 + src.m[2*FRL+2] * det2_01_01;
   1149 	const float det3_201_013 = src.m[2*FRL+0] * det2_01_13 - src.m[2*FRL+1] * det2_01_03 + src.m[2*FRL+3] * det2_01_01;
   1150 	const float det3_201_023 = src.m[2*FRL+0] * det2_01_23 - src.m[2*FRL+2] * det2_01_03 + src.m[2*FRL+3] * det2_01_02;
   1151 	const float det3_201_123 = src.m[2*FRL+1] * det2_01_23 - src.m[2*FRL+2] * det2_01_13 + src.m[2*FRL+3] * det2_01_12;
   1152 
        	// expansion of the 4x4 determinant along the last row
   1153 	const float det = ( - det3_201_123 * src.m[3*FRL+0] + det3_201_023 * src.m[3*FRL+1] - det3_201_013 * src.m[3*FRL+2] + det3_201_012 * src.m[3*FRL+3] );
   1154 
        	// treat the matrix as singular; 'out' has not been written at this point
   1155 	if ( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) {
   1156 		return false;
   1157 	}
   1158 
   1159 	const float rcpDet = 1.0f / det;
   1160 
   1161 	// remaining 2x2 sub-determinants
   1162 	const float det2_03_01 = src.m[0*FRL+0] * src.m[3*FRL+1] - src.m[0*FRL+1] * src.m[3*FRL+0];
   1163 	const float det2_03_02 = src.m[0*FRL+0] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+0];
   1164 	const float det2_03_03 = src.m[0*FRL+0] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+0];
   1165 	const float det2_03_12 = src.m[0*FRL+1] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+1];
   1166 	const float det2_03_13 = src.m[0*FRL+1] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+1];
   1167 	const float det2_03_23 = src.m[0*FRL+2] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+2];
   1168 
   1169 	const float det2_13_01 = src.m[1*FRL+0] * src.m[3*FRL+1] - src.m[1*FRL+1] * src.m[3*FRL+0];
   1170 	const float det2_13_02 = src.m[1*FRL+0] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+0];
   1171 	const float det2_13_03 = src.m[1*FRL+0] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+0];
   1172 	const float det2_13_12 = src.m[1*FRL+1] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+1];
   1173 	const float det2_13_13 = src.m[1*FRL+1] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+1];
   1174 	const float det2_13_23 = src.m[1*FRL+2] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+2];
   1175 
   1176 	// remaining 3x3 sub-determinants
   1177 	const float det3_203_012 = src.m[2*FRL+0] * det2_03_12 - src.m[2*FRL+1] * det2_03_02 + src.m[2*FRL+2] * det2_03_01;
   1178 	const float det3_203_013 = src.m[2*FRL+0] * det2_03_13 - src.m[2*FRL+1] * det2_03_03 + src.m[2*FRL+3] * det2_03_01;
   1179 	const float det3_203_023 = src.m[2*FRL+0] * det2_03_23 - src.m[2*FRL+2] * det2_03_03 + src.m[2*FRL+3] * det2_03_02;
   1180 	const float det3_203_123 = src.m[2*FRL+1] * det2_03_23 - src.m[2*FRL+2] * det2_03_13 + src.m[2*FRL+3] * det2_03_12;
   1181 
   1182 	const float det3_213_012 = src.m[2*FRL+0] * det2_13_12 - src.m[2*FRL+1] * det2_13_02 + src.m[2*FRL+2] * det2_13_01;
   1183 	const float det3_213_013 = src.m[2*FRL+0] * det2_13_13 - src.m[2*FRL+1] * det2_13_03 + src.m[2*FRL+3] * det2_13_01;
   1184 	const float det3_213_023 = src.m[2*FRL+0] * det2_13_23 - src.m[2*FRL+2] * det2_13_03 + src.m[2*FRL+3] * det2_13_02;
   1185 	const float det3_213_123 = src.m[2*FRL+1] * det2_13_23 - src.m[2*FRL+2] * det2_13_13 + src.m[2*FRL+3] * det2_13_12;
   1186 
   1187 	const float det3_301_012 = src.m[3*FRL+0] * det2_01_12 - src.m[3*FRL+1] * det2_01_02 + src.m[3*FRL+2] * det2_01_01;
   1188 	const float det3_301_013 = src.m[3*FRL+0] * det2_01_13 - src.m[3*FRL+1] * det2_01_03 + src.m[3*FRL+3] * det2_01_01;
   1189 	const float det3_301_023 = src.m[3*FRL+0] * det2_01_23 - src.m[3*FRL+2] * det2_01_03 + src.m[3*FRL+3] * det2_01_02;
   1190 	const float det3_301_123 = src.m[3*FRL+1] * det2_01_23 - src.m[3*FRL+2] * det2_01_13 + src.m[3*FRL+3] * det2_01_12;
   1191 
        	// signed 3x3 determinants scaled by 1 / det, written one output column at a time;
        	// every read of 'src' has already happened by this point
   1192 	out.m[0*FRL+0] = - det3_213_123 * rcpDet;
   1193 	out.m[1*FRL+0] = + det3_213_023 * rcpDet;
   1194 	out.m[2*FRL+0] = - det3_213_013 * rcpDet;
   1195 	out.m[3*FRL+0] = + det3_213_012 * rcpDet;
   1196 
   1197 	out.m[0*FRL+1] = + det3_203_123 * rcpDet;
   1198 	out.m[1*FRL+1] = - det3_203_023 * rcpDet;
   1199 	out.m[2*FRL+1] = + det3_203_013 * rcpDet;
   1200 	out.m[3*FRL+1] = - det3_203_012 * rcpDet;
   1201 
   1202 	out.m[0*FRL+2] = + det3_301_123 * rcpDet;
   1203 	out.m[1*FRL+2] = - det3_301_023 * rcpDet;
   1204 	out.m[2*FRL+2] = + det3_301_013 * rcpDet;
   1205 	out.m[3*FRL+2] = - det3_301_012 * rcpDet;
   1206 
   1207 	out.m[0*FRL+3] = - det3_201_123 * rcpDet;
   1208 	out.m[1*FRL+3] = + det3_201_023 * rcpDet;
   1209 	out.m[2*FRL+3] = - det3_201_013 * rcpDet;
   1210 	out.m[3*FRL+3] = + det3_201_012 * rcpDet;
   1211 
   1212 #endif
   1213 
   1214 	return true;
   1215 }
   1216 
   1217 /*
   1218 ========================
   1219 idRenderMatrix::InverseByTranspose
   1220 ========================
   1221 */
   1222 void idRenderMatrix::InverseByTranspose( const idRenderMatrix & src, idRenderMatrix & out ) {
   1223 	assert( &src != &out );
   1224 	assert( src.IsAffineTransform( 0.01f ) );
   1225 
   1226 	out[0][0] = src[0][0];
   1227 	out[1][0] = src[0][1];
   1228 	out[2][0] = src[0][2];
   1229 	out[3][0] = 0.0f;
   1230 	out[0][1] = src[1][0];
   1231 	out[1][1] = src[1][1];
   1232 	out[2][1] = src[1][2];
   1233 	out[3][1] = 0.0f;
   1234 	out[0][2] = src[2][0];
   1235 	out[1][2] = src[2][1];
   1236 	out[2][2] = src[2][2];
   1237 	out[3][2] = 0.0f;
   1238 	out[0][3] = -( src[0][0] * src[0][3] + src[1][0] * src[1][3] + src[2][0] * src[2][3] );
   1239 	out[1][3] = -( src[0][1] * src[0][3] + src[1][1] * src[1][3] + src[2][1] * src[2][3] );
   1240 	out[2][3] = -( src[0][2] * src[0][3] + src[1][2] * src[1][3] + src[2][2] * src[2][3] );
   1241 	out[3][3] = 1.0f;
   1242 }
   1243 
   1244 
   1245 /*
   1246 ========================
   1247 idRenderMatrix::InverseByDoubles
   1248 
   1249 This should never be used at run-time.
   1250 This is only for tools where more precision is needed.
   1251 ========================
   1252 */
   1253 bool idRenderMatrix::InverseByDoubles( const idRenderMatrix & src, idRenderMatrix & out ) {
   1254 	const int FRL = 4;
   1255 
   1256 	// 84+4+16 = 104 multiplications
   1257 	//			   1 division
   1258 
   1259 	// 2x2 sub-determinants required to calculate 4x4 determinant
   1260 	const double det2_01_01 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[1*FRL+0];
   1261 	const double det2_01_02 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+0];
   1262 	const double det2_01_03 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+0];
   1263 	const double det2_01_12 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+1];
   1264 	const double det2_01_13 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+1];
   1265 	const double det2_01_23 = (double)src.m[0*FRL+2] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+2];
   1266 
   1267 	// 3x3 sub-determinants required to calculate 4x4 determinant
   1268 	const double det3_201_012 = (double)src.m[2*FRL+0] * det2_01_12 - (double)src.m[2*FRL+1] * det2_01_02 + (double)src.m[2*FRL+2] * det2_01_01;
   1269 	const double det3_201_013 = (double)src.m[2*FRL+0] * det2_01_13 - (double)src.m[2*FRL+1] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_01;
   1270 	const double det3_201_023 = (double)src.m[2*FRL+0] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_02;
   1271 	const double det3_201_123 = (double)src.m[2*FRL+1] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_13 + (double)src.m[2*FRL+3] * det2_01_12;
   1272 
   1273 	const double det = ( - det3_201_123 * (double)src.m[3*FRL+0] + det3_201_023 * (double)src.m[3*FRL+1] - det3_201_013 * (double)src.m[3*FRL+2] + det3_201_012 * (double)src.m[3*FRL+3] );
   1274 
   1275 	const double rcpDet = 1.0f / det;
   1276 
   1277 	// remaining 2x2 sub-determinants
   1278 	const double det2_03_01 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[3*FRL+0];
   1279 	const double det2_03_02 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+0];
   1280 	const double det2_03_03 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+0];
   1281 	const double det2_03_12 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+1];
   1282 	const double det2_03_13 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+1];
   1283 	const double det2_03_23 = (double)src.m[0*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+2];
   1284 
   1285 	const double det2_13_01 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[1*FRL+1] * (double)src.m[3*FRL+0];
   1286 	const double det2_13_02 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+0];
   1287 	const double det2_13_03 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+0];
   1288 	const double det2_13_12 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+1];
   1289 	const double det2_13_13 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+1];
   1290 	const double det2_13_23 = (double)src.m[1*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+2];
   1291 
   1292 	// remaining 3x3 sub-determinants
   1293 	const double det3_203_012 = (double)src.m[2*FRL+0] * det2_03_12 - (double)src.m[2*FRL+1] * det2_03_02 + (double)src.m[2*FRL+2] * det2_03_01;
   1294 	const double det3_203_013 = (double)src.m[2*FRL+0] * det2_03_13 - (double)src.m[2*FRL+1] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_01;
   1295 	const double det3_203_023 = (double)src.m[2*FRL+0] * det2_03_23 - (double)src.m[2*FRL+2] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_02;
   1296 	const double det3_203_123 = (double)src.m[2*FRL+1] * det2_03_23 - (double)src.m[2*FRL+2] * det2_03_13 + (double)src.m[2*FRL+3] * det2_03_12;
   1297 
   1298 	const double det3_213_012 = (double)src.m[2*FRL+0] * det2_13_12 - (double)src.m[2*FRL+1] * det2_13_02 + (double)src.m[2*FRL+2] * det2_13_01;
   1299 	const double det3_213_013 = (double)src.m[2*FRL+0] * det2_13_13 - (double)src.m[2*FRL+1] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_01;
   1300 	const double det3_213_023 = (double)src.m[2*FRL+0] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_02;
   1301 	const double det3_213_123 = (double)src.m[2*FRL+1] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_13 + (double)src.m[2*FRL+3] * det2_13_12;
   1302 
   1303 	const double det3_301_012 = (double)src.m[3*FRL+0] * det2_01_12 - (double)src.m[3*FRL+1] * det2_01_02 + (double)src.m[3*FRL+2] * det2_01_01;
   1304 	const double det3_301_013 = (double)src.m[3*FRL+0] * det2_01_13 - (double)src.m[3*FRL+1] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_01;
   1305 	const double det3_301_023 = (double)src.m[3*FRL+0] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_02;
   1306 	const double det3_301_123 = (double)src.m[3*FRL+1] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_13 + (double)src.m[3*FRL+3] * det2_01_12;
   1307 
   1308 	out.m[0*FRL+0] = (float)( - det3_213_123 * rcpDet );
   1309 	out.m[1*FRL+0] = (float)( + det3_213_023 * rcpDet );
   1310 	out.m[2*FRL+0] = (float)( - det3_213_013 * rcpDet );
   1311 	out.m[3*FRL+0] = (float)( + det3_213_012 * rcpDet );
   1312 
   1313 	out.m[0*FRL+1] = (float)( + det3_203_123 * rcpDet );
   1314 	out.m[1*FRL+1] = (float)( - det3_203_023 * rcpDet );
   1315 	out.m[2*FRL+1] = (float)( + det3_203_013 * rcpDet );
   1316 	out.m[3*FRL+1] = (float)( - det3_203_012 * rcpDet );
   1317 
   1318 	out.m[0*FRL+2] = (float)( + det3_301_123 * rcpDet );
   1319 	out.m[1*FRL+2] = (float)( - det3_301_023 * rcpDet );
   1320 	out.m[2*FRL+2] = (float)( + det3_301_013 * rcpDet );
   1321 	out.m[3*FRL+2] = (float)( - det3_301_012 * rcpDet );
   1322 
   1323 	out.m[0*FRL+3] = (float)( - det3_201_123 * rcpDet );
   1324 	out.m[1*FRL+3] = (float)( + det3_201_023 * rcpDet );
   1325 	out.m[2*FRL+3] = (float)( - det3_201_013 * rcpDet );
   1326 	out.m[3*FRL+3] = (float)( + det3_201_012 * rcpDet );
   1327 
   1328 	return true;
   1329 }
   1330 
   1331 
   1332 /*
   1333 ========================
   1334 DeterminantIsNegative
   1335 ========================
   1336 */
   1337 #ifdef ID_WIN_X86_SSE2_INTRIN
   1338 
   1339 void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const __m128 & r1, const __m128 & r2, const __m128 & r3 ) {
   1340 
   1341 	const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1342 	const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1343 	const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1344 
   1345 	const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1346 	const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1347 
   1348 	const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
   1349 	const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1350 	const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1351 
   1352 	const __m128 m_r2u2_r3u3      						= _mm_mul_ps( r2u2, r3u3 );
   1353 	const __m128 m_r1u1_r2u2_r3u3 						= _mm_mul_ps( r1u1, m_r2u2_r3u3 );
   1354 	const __m128 m_r2u3_r3u1							= _mm_mul_ps( r2u3, r3u1 );
   1355 	const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3	= _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 );
   1356 	const __m128 m_r2u1_r3u2							= _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1357 	const __m128 pos_part_det3x3_r0						= _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 );
   1358 	const __m128 m_r2u3_r3u2							= _mm_mul_ps( r2u3, r3u2 );
   1359 	const __m128 m_r1u1_r2u3_r3u2						= _mm_mul_ps( r1u1, m_r2u3_r3u2 );
   1360 	const __m128 m_r2u1_r3u3							= _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
   1361 	const __m128 a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2	= _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 );
   1362 	const __m128 m_r2u2_r3u1							= _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   1363 	const __m128 neg_part_det3x3_r0						= _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 );
   1364 	const __m128 det3x3_r0 								= _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 );
   1365 
   1366 	const __m128 c_zero		= _mm_setzero_ps();
   1367 	const __m128 c_mask		= _mm_cmpeq_ps( c_zero, c_zero );
   1368 	const __m128 c_signmask	= _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) );
   1369 	const __m128 c_znzn		= _mm_unpacklo_ps( c_zero, c_signmask );
   1370 
   1371 	const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn );
   1372 
   1373 	const __m128 dot0	= _mm_mul_ps( r0, cofactor_r0 );
   1374 	const __m128 dot1 	= _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) );
   1375 	const __m128 det	= _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   1376 
   1377 	const __m128 result	= _mm_cmpgt_ps( c_zero, det );
   1378 
   1379 	negativeDeterminant	= _mm_movemask_ps( result ) & 1;
   1380 }
   1381 
   1382 #else
   1383 
   1384 void DeterminantIsNegative( bool & negativeDeterminant, const float * row0, const float * row1, const float * row2, const float * row3 ) {
   1385 
   1386 	// 2x2 sub-determinants required to calculate 4x4 determinant
   1387 	const float det2_01_01 = row0[0] * row1[1] - row0[1] * row1[0];
   1388 	const float det2_01_02 = row0[0] * row1[2] - row0[2] * row1[0];
   1389 	const float det2_01_03 = row0[0] * row1[3] - row0[3] * row1[0];
   1390 	const float det2_01_12 = row0[1] * row1[2] - row0[2] * row1[1];
   1391 	const float det2_01_13 = row0[1] * row1[3] - row0[3] * row1[1];
   1392 	const float det2_01_23 = row0[2] * row1[3] - row0[3] * row1[2];
   1393 
   1394 	// 3x3 sub-determinants required to calculate 4x4 determinant
   1395 	const float det3_201_012 = row2[0] * det2_01_12 - row2[1] * det2_01_02 + row2[2] * det2_01_01;
   1396 	const float det3_201_013 = row2[0] * det2_01_13 - row2[1] * det2_01_03 + row2[3] * det2_01_01;
   1397 	const float det3_201_023 = row2[0] * det2_01_23 - row2[2] * det2_01_03 + row2[3] * det2_01_02;
   1398 	const float det3_201_123 = row2[1] * det2_01_23 - row2[2] * det2_01_13 + row2[3] * det2_01_12;
   1399 
   1400 	const float det = ( - det3_201_123 * row3[0] + det3_201_023 * row3[1] - det3_201_013 * row3[2] + det3_201_012 * row3[3] );
   1401 
   1402 	negativeDeterminant = ( det < 0.0f );
   1403 }
   1404 
   1405 #endif
   1406 
   1407 /*
   1408 ========================
   1409 idRenderMatrix::CopyMatrix
   1410 ========================
   1411 */
   1412 void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3 ) {
   1413 	assert_16_byte_aligned( row0.ToFloatPtr() );
   1414 	assert_16_byte_aligned( row1.ToFloatPtr() );
   1415 	assert_16_byte_aligned( row2.ToFloatPtr() );
   1416 	assert_16_byte_aligned( row3.ToFloatPtr() );
   1417 
   1418 #ifdef ID_WIN_X86_SSE2_INTRIN
   1419 
   1420 	const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 );
   1421 	const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 );
   1422 	const __m128 r2 = _mm_loadu_ps( matrix.m + 2 * 4 );
   1423 	const __m128 r3 = _mm_loadu_ps( matrix.m + 3 * 4 );
   1424 
   1425 	_mm_store_ps( row0.ToFloatPtr(), r0 );
   1426 	_mm_store_ps( row1.ToFloatPtr(), r1 );
   1427 	_mm_store_ps( row2.ToFloatPtr(), r2 );
   1428 	_mm_store_ps( row3.ToFloatPtr(), r3 );
   1429 
   1430 #else
   1431 
   1432 	memcpy( row0.ToFloatPtr(), matrix[0], sizeof( idVec4 ) );
   1433 	memcpy( row1.ToFloatPtr(), matrix[1], sizeof( idVec4 ) );
   1434 	memcpy( row2.ToFloatPtr(), matrix[2], sizeof( idVec4 ) );
   1435 	memcpy( row3.ToFloatPtr(), matrix[3], sizeof( idVec4 ) );
   1436 
   1437 #endif
   1438 }
   1439 
   1440 /*
   1441 ========================
   1442 idRenderMatrix::SetMVP
   1443 ========================
   1444 */
   1445 void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
   1446 	assert_16_byte_aligned( row0.ToFloatPtr() );
   1447 	assert_16_byte_aligned( row1.ToFloatPtr() );
   1448 	assert_16_byte_aligned( row2.ToFloatPtr() );
   1449 	assert_16_byte_aligned( row3.ToFloatPtr() );
   1450 
   1451 #ifdef ID_WIN_X86_SSE2_INTRIN
   1452 
   1453 	const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
   1454 	const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
   1455 	const __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
   1456 	const __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
   1457 
   1458 	_mm_store_ps( row0.ToFloatPtr(), r0 );
   1459 	_mm_store_ps( row1.ToFloatPtr(), r1 );
   1460 	_mm_store_ps( row2.ToFloatPtr(), r2 );
   1461 	_mm_store_ps( row3.ToFloatPtr(), r3 );
   1462 
   1463 	DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
   1464 
   1465 #else
   1466 
   1467 	memcpy( row0.ToFloatPtr(), mvp[0], sizeof( idVec4 ) );
   1468 	memcpy( row1.ToFloatPtr(), mvp[1], sizeof( idVec4 ) );
   1469 	memcpy( row2.ToFloatPtr(), mvp[2], sizeof( idVec4 ) );
   1470 	memcpy( row3.ToFloatPtr(), mvp[3], sizeof( idVec4 ) );
   1471 
   1472 	DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] );
   1473 
   1474 #endif
   1475 }
   1476 
   1477 /*
   1478 ========================
   1479 idRenderMatrix::SetMVPForBounds
   1480 ========================
   1481 */
   1482 void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds & bounds, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
   1483 	assert_16_byte_aligned( row0.ToFloatPtr() );
   1484 	assert_16_byte_aligned( row1.ToFloatPtr() );
   1485 	assert_16_byte_aligned( row2.ToFloatPtr() );
   1486 	assert_16_byte_aligned( row3.ToFloatPtr() );
   1487 
   1488 #ifdef ID_WIN_X86_SSE2_INTRIN
   1489 
   1490 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   1491 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   1492 
   1493 	__m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half );
   1494 	__m128 scale  = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half );
   1495 
   1496 	scale = _mm_or_ps( scale, vector_float_last_one );
   1497 
   1498 	__m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
   1499 	__m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
   1500 	__m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
   1501 	__m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
   1502 
   1503 	__m128 d0 = _mm_mul_ps( r0, offset );
   1504 	__m128 d1 = _mm_mul_ps( r1, offset );
   1505 	__m128 d2 = _mm_mul_ps( r2, offset );
   1506 	__m128 d3 = _mm_mul_ps( r3, offset );
   1507 
   1508 	__m128 s0 = _mm_unpacklo_ps( d0, d2 );		// a0, c0, a1, c1
   1509 	__m128 s1 = _mm_unpackhi_ps( d0, d2 );		// a2, c2, a3, c3
   1510 	__m128 s2 = _mm_unpacklo_ps( d1, d3 );		// b0, d0, b1, d1
   1511 	__m128 s3 = _mm_unpackhi_ps( d1, d3 );		// b2, d2, b3, d3
   1512 
   1513 	__m128 t0 = _mm_unpacklo_ps( s0, s2 );		// a0, b0, c0, d0
   1514 	__m128 t1 = _mm_unpackhi_ps( s0, s2 );		// a1, b1, c1, d1
   1515 	__m128 t2 = _mm_unpacklo_ps( s1, s3 );		// a2, b2, c2, d2
   1516 
   1517 	t0 = _mm_add_ps( t0, t1 );
   1518 	t0 = _mm_add_ps( t0, t2 );
   1519 
   1520 	__m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last );
   1521 	__m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last );
   1522 	__m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last );
   1523 	__m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last );
   1524 
   1525 	r0 = _mm_madd_ps( r0, scale, n0 );
   1526 	r1 = _mm_madd_ps( r1, scale, n1 );
   1527 	r2 = _mm_madd_ps( r2, scale, n2 );
   1528 	r3 = _mm_madd_ps( r3, scale, n3 );
   1529 
   1530 	_mm_store_ps( row0.ToFloatPtr(), r0 );
   1531 	_mm_store_ps( row1.ToFloatPtr(), r1 );
   1532 	_mm_store_ps( row2.ToFloatPtr(), r2 );
   1533 	_mm_store_ps( row3.ToFloatPtr(), r3 );
   1534 
   1535 	DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
   1536 
   1537 #else
   1538 
   1539 	const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
   1540 	const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
   1541 
   1542 	row0[0] = mvp[0][0] * scale[0];
   1543 	row0[1] = mvp[0][1] * scale[1];
   1544 	row0[2] = mvp[0][2] * scale[2];
   1545 	row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2];
   1546 
   1547 	row1[0] = mvp[1][0] * scale[0];
   1548 	row1[1] = mvp[1][1] * scale[1];
   1549 	row1[2] = mvp[1][2] * scale[2];
   1550 	row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + mvp[1][1] * offset[1] + mvp[1][2] * offset[2];
   1551 
   1552 	row2[0] = mvp[2][0] * scale[0];
   1553 	row2[1] = mvp[2][1] * scale[1];
   1554 	row2[2] = mvp[2][2] * scale[2];
   1555 	row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2];
   1556 
   1557 	row3[0] = mvp[3][0] * scale[0];
   1558 	row3[1] = mvp[3][1] * scale[1];
   1559 	row3[2] = mvp[3][2] * scale[2];
   1560 	row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2];
   1561 
   1562 	DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
   1563 
   1564 #endif
   1565 }
   1566 
   1567 /*
   1568 ========================
   1569 idRenderMatrix::SetMVPForInverseProject
   1570 ========================
   1571 */
   1572 void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const idRenderMatrix & inverseProject, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) {
   1573 	assert_16_byte_aligned( row0.ToFloatPtr() );
   1574 	assert_16_byte_aligned( row1.ToFloatPtr() );
   1575 	assert_16_byte_aligned( row2.ToFloatPtr() );
   1576 	assert_16_byte_aligned( row3.ToFloatPtr() );
   1577 
   1578 #ifdef ID_WIN_X86_SSE2_INTRIN
   1579 
   1580 	__m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
   1581 	__m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
   1582 	__m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 );
   1583 	__m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 );
   1584 
   1585 	__m128 p0 = _mm_loadu_ps( inverseProject.m + 0 * 4 );
   1586 	__m128 p1 = _mm_loadu_ps( inverseProject.m + 1 * 4 );
   1587 	__m128 p2 = _mm_loadu_ps( inverseProject.m + 2 * 4 );
   1588 	__m128 p3 = _mm_loadu_ps( inverseProject.m + 3 * 4 );
   1589 
   1590 	__m128 t0 = _mm_mul_ps( _mm_splat_ps( r0, 0 ), p0 );
   1591 	__m128 t1 = _mm_mul_ps( _mm_splat_ps( r1, 0 ), p0 );
   1592 	__m128 t2 = _mm_mul_ps( _mm_splat_ps( r2, 0 ), p0 );
   1593 	__m128 t3 = _mm_mul_ps( _mm_splat_ps( r3, 0 ), p0 );
   1594 
   1595 	t0 = _mm_madd_ps( _mm_splat_ps( r0, 1 ), p1, t0 );
   1596 	t1 = _mm_madd_ps( _mm_splat_ps( r1, 1 ), p1, t1 );
   1597 	t2 = _mm_madd_ps( _mm_splat_ps( r2, 1 ), p1, t2 );
   1598 	t3 = _mm_madd_ps( _mm_splat_ps( r3, 1 ), p1, t3 );
   1599 
   1600 	t0 = _mm_madd_ps( _mm_splat_ps( r0, 2 ), p2, t0 );
   1601 	t1 = _mm_madd_ps( _mm_splat_ps( r1, 2 ), p2, t1 );
   1602 	t2 = _mm_madd_ps( _mm_splat_ps( r2, 2 ), p2, t2 );
   1603 	t3 = _mm_madd_ps( _mm_splat_ps( r3, 2 ), p2, t3 );
   1604 
   1605 	t0 = _mm_madd_ps( _mm_splat_ps( r0, 3 ), p3, t0 );
   1606 	t1 = _mm_madd_ps( _mm_splat_ps( r1, 3 ), p3, t1 );
   1607 	t2 = _mm_madd_ps( _mm_splat_ps( r2, 3 ), p3, t2 );
   1608 	t3 = _mm_madd_ps( _mm_splat_ps( r3, 3 ), p3, t3 );
   1609 
   1610 	_mm_store_ps( row0.ToFloatPtr(), t0 );
   1611 	_mm_store_ps( row1.ToFloatPtr(), t1 );
   1612 	_mm_store_ps( row2.ToFloatPtr(), t2 );
   1613 	_mm_store_ps( row3.ToFloatPtr(), t3 );
   1614 
   1615 	DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 );
   1616 
   1617 #else
   1618 
   1619 	row0[0] = mvp.m[0*4+0]*inverseProject.m[0*4+0] + mvp.m[0*4+1]*inverseProject.m[1*4+0] + mvp.m[0*4+2]*inverseProject.m[2*4+0] + mvp.m[0*4+3]*inverseProject.m[3*4+0];
   1620 	row0[1] = mvp.m[0*4+0]*inverseProject.m[0*4+1] + mvp.m[0*4+1]*inverseProject.m[1*4+1] + mvp.m[0*4+2]*inverseProject.m[2*4+1] + mvp.m[0*4+3]*inverseProject.m[3*4+1];
   1621 	row0[2] = mvp.m[0*4+0]*inverseProject.m[0*4+2] + mvp.m[0*4+1]*inverseProject.m[1*4+2] + mvp.m[0*4+2]*inverseProject.m[2*4+2] + mvp.m[0*4+3]*inverseProject.m[3*4+2];
   1622 	row0[3] = mvp.m[0*4+0]*inverseProject.m[0*4+3] + mvp.m[0*4+1]*inverseProject.m[1*4+3] + mvp.m[0*4+2]*inverseProject.m[2*4+3] + mvp.m[0*4+3]*inverseProject.m[3*4+3];
   1623 
   1624 	row1[0] = mvp.m[1*4+0]*inverseProject.m[0*4+0] + mvp.m[1*4+1]*inverseProject.m[1*4+0] + mvp.m[1*4+2]*inverseProject.m[2*4+0] + mvp.m[1*4+3]*inverseProject.m[3*4+0];
   1625 	row1[1] = mvp.m[1*4+0]*inverseProject.m[0*4+1] + mvp.m[1*4+1]*inverseProject.m[1*4+1] + mvp.m[1*4+2]*inverseProject.m[2*4+1] + mvp.m[1*4+3]*inverseProject.m[3*4+1];
   1626 	row1[2] = mvp.m[1*4+0]*inverseProject.m[0*4+2] + mvp.m[1*4+1]*inverseProject.m[1*4+2] + mvp.m[1*4+2]*inverseProject.m[2*4+2] + mvp.m[1*4+3]*inverseProject.m[3*4+2];
   1627 	row1[3] = mvp.m[1*4+0]*inverseProject.m[0*4+3] + mvp.m[1*4+1]*inverseProject.m[1*4+3] + mvp.m[1*4+2]*inverseProject.m[2*4+3] + mvp.m[1*4+3]*inverseProject.m[3*4+3];
   1628 
   1629 	row2[0] = mvp.m[2*4+0]*inverseProject.m[0*4+0] + mvp.m[2*4+1]*inverseProject.m[1*4+0] + mvp.m[2*4+2]*inverseProject.m[2*4+0] + mvp.m[2*4+3]*inverseProject.m[3*4+0];
   1630 	row2[1] = mvp.m[2*4+0]*inverseProject.m[0*4+1] + mvp.m[2*4+1]*inverseProject.m[1*4+1] + mvp.m[2*4+2]*inverseProject.m[2*4+1] + mvp.m[2*4+3]*inverseProject.m[3*4+1];
   1631 	row2[2] = mvp.m[2*4+0]*inverseProject.m[0*4+2] + mvp.m[2*4+1]*inverseProject.m[1*4+2] + mvp.m[2*4+2]*inverseProject.m[2*4+2] + mvp.m[2*4+3]*inverseProject.m[3*4+2];
   1632 	row2[3] = mvp.m[2*4+0]*inverseProject.m[0*4+3] + mvp.m[2*4+1]*inverseProject.m[1*4+3] + mvp.m[2*4+2]*inverseProject.m[2*4+3] + mvp.m[2*4+3]*inverseProject.m[3*4+3];
   1633 
   1634 	row3[0] = mvp.m[3*4+0]*inverseProject.m[0*4+0] + mvp.m[3*4+1]*inverseProject.m[1*4+0] + mvp.m[3*4+2]*inverseProject.m[2*4+0] + mvp.m[3*4+3]*inverseProject.m[3*4+0];
   1635 	row3[1] = mvp.m[3*4+0]*inverseProject.m[0*4+1] + mvp.m[3*4+1]*inverseProject.m[1*4+1] + mvp.m[3*4+2]*inverseProject.m[2*4+1] + mvp.m[3*4+3]*inverseProject.m[3*4+1];
   1636 	row3[2] = mvp.m[3*4+0]*inverseProject.m[0*4+2] + mvp.m[3*4+1]*inverseProject.m[1*4+2] + mvp.m[3*4+2]*inverseProject.m[2*4+2] + mvp.m[3*4+3]*inverseProject.m[3*4+2];
   1637 	row3[3] = mvp.m[3*4+0]*inverseProject.m[0*4+3] + mvp.m[3*4+1]*inverseProject.m[1*4+3] + mvp.m[3*4+2]*inverseProject.m[2*4+3] + mvp.m[3*4+3]*inverseProject.m[3*4+3];
   1638 
   1639 	DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
   1640 
   1641 #endif
   1642 }
   1643 
   1644 /*
   1645 ========================
   1646 idRenderMatrix::CullPointToMVPbits
   1647 
   1648 Returns true if the point transformed by the given Model View Projection (MVP) matrix is
   1649 outside the clip space.
   1650 
   1651 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
   1652 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
   1653 ========================
   1654 */
   1655 bool idRenderMatrix::CullPointToMVPbits( const idRenderMatrix & mvp, const idVec3 & p, byte * outBits, bool zeroToOne ) {
   1656 
   1657 	idVec4 c;
   1658 	for ( int i = 0; i < 4; i++ ) {
   1659 		c[i] = p[0] * mvp[i][0] + p[1] * mvp[i][1] + p[2] * mvp[i][2] + mvp[i][3];
   1660 	}
   1661 
   1662 	const float minW = zeroToOne ? 0.0f : -c[3];
   1663 	const float maxW = c[3];
   1664 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   1665 	const float minZ = 0.0f;
   1666 #else
   1667 	const float minZ = minW;
   1668 #endif
   1669 
   1670 	int bits = 0;
   1671 	if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
   1672 	if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
   1673 	if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
   1674 	if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
   1675 	if ( c[2] > minZ ) { bits |= ( 1 << 4 ); }	// NOTE: using minZ
   1676 	if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
   1677 
   1678 	// store out a bit set for each side where the point is outside the clip space
   1679 	*outBits = (byte)( bits ^ 63 );
   1680 
   1681 	// if any bits weren't set, the point is completely off one side of the frustum
   1682 	return ( bits != 63 );
   1683 }
   1684 
   1685 /*
   1686 ========================
   1687 idRenderMatrix::CullBoundsToMVPbits
   1688 
   1689 Returns true if nothing contained in the bounds is transformed by the given
   1690 Model View Projection (MVP) matrix to anything inside the clip space.
   1691 
   1692 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
   1693 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
   1694 
   1695 When all the corners of the bounding box are behind one of the six frustum planes, the box is
   1696 culled. This is conservative, because some boxes may "cross corners" and can be in front of a
   1697 frustum plane, but only while also being behind another one.
   1698 ========================
   1699 */
   1700 bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, byte * outBits, bool zeroToOne ) {
   1701 
   1702 #ifdef ID_WIN_X86_SSE2_INTRIN
   1703 
   1704 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
   1705 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
   1706 	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
   1707 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
   1708 
   1709 	__m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one;
   1710 
   1711 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   1712 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   1713 
   1714 	// take the four points on the X-Y plane
   1715 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   1716 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   1717 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   1718 
   1719 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   1720 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   1721 
   1722 	// compute four partial X,Y,Z,W values
   1723 	__m128 parx = _mm_splat_ps( mvp0, 3 );
   1724 	__m128 pary = _mm_splat_ps( mvp1, 3 );
   1725 	__m128 parz = _mm_splat_ps( mvp2, 3 );
   1726 	__m128 parw = _mm_splat_ps( mvp3, 3 );
   1727 
   1728 	parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
   1729 	pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
   1730 	parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   1731 	parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   1732 
   1733 	parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
   1734 	pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
   1735 	parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   1736 	parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   1737 
   1738 	// compute full X,Y,Z,W values
   1739 	__m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   1740 	__m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   1741 	__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   1742 	__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   1743 
   1744 	__m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
   1745 	__m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
   1746 	__m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
   1747 	__m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
   1748 
   1749 	__m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
   1750 	__m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
   1751 	__m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
   1752 	__m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
   1753 
   1754 	__m128 maxW0 = w0;
   1755 	__m128 maxW1 = w1;
   1756 	__m128 minW0 = _mm_mul_ps( w0, minMul );
   1757 	__m128 minW1 = _mm_mul_ps( w1, minMul );
   1758 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   1759 	__m128 minZ0 = vector_float_zero;
   1760 	__m128 minZ1 = vector_float_zero;
   1761 #else
   1762 	__m128 minZ0 = minW0;
   1763 	__m128 minZ1 = minW1;
   1764 #endif
   1765 
   1766 	__m128 cullBits0 = _mm_cmpgt_ps( x0, minW0 );
   1767 	__m128 cullBits1 = _mm_cmpgt_ps( maxW0, x0 );
   1768 	__m128 cullBits2 = _mm_cmpgt_ps( y0, minW0 );
   1769 	__m128 cullBits3 = _mm_cmpgt_ps( maxW0, y0 );
   1770 	__m128 cullBits4 = _mm_cmpgt_ps( z0, minZ0 );	// NOTE: using minZ0
   1771 	__m128 cullBits5 = _mm_cmpgt_ps( maxW0, z0 );
   1772 
   1773 	cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
   1774 	cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
   1775 	cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
   1776 	cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
   1777 	cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) );	// NOTE: using minZ1
   1778 	cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
   1779 
   1780 	cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 );
   1781 	cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 );
   1782 	cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 );
   1783 	cullBits3 = _mm_and_ps( cullBits3, vector_float_mask3 );
   1784 	cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 );
   1785 	cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 );
   1786 
   1787 	cullBits0 = _mm_or_ps( cullBits0, cullBits1 );
   1788 	cullBits2 = _mm_or_ps( cullBits2, cullBits3 );
   1789 	cullBits4 = _mm_or_ps( cullBits4, cullBits5 );
   1790 	cullBits0 = _mm_or_ps( cullBits0, cullBits2 );
   1791 	cullBits0 = _mm_or_ps( cullBits0, cullBits4 );
   1792 
   1793 	cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   1794 	cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) );
   1795 
   1796 	int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 );
   1797 
   1798 	*outBits = (byte)( bits ^ 63 );
   1799 
   1800 	return ( bits != 63 );
   1801 
   1802 #else
   1803 
   1804 	int bits = 0;
   1805 
   1806 	idVec3 v;
   1807 	for ( int x = 0; x < 2; x++ ) {
   1808 		v[0] = bounds[x][0];
   1809 		for ( int y = 0; y < 2; y++ ) {
   1810 			v[1] = bounds[y][1];
   1811 			for ( int z = 0; z < 2; z++ ) {
   1812 				v[2] = bounds[z][2];
   1813 
   1814 				idVec4 c;
   1815 				for ( int i = 0; i < 4; i++ ) {
   1816 					c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3];
   1817 				}
   1818 
   1819 				const float minW = zeroToOne ? 0.0f : -c[3];
   1820 				const float maxW = c[3];
   1821 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   1822 				const float minZ = 0.0f;
   1823 #else
   1824 				const float minZ = minW;
   1825 #endif
   1826 
   1827 				if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
   1828 				if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
   1829 				if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
   1830 				if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
   1831 				if ( c[2] > minZ ) { bits |= ( 1 << 4 ); }	// NOTE: using minZ
   1832 				if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
   1833 			}
   1834 		}
   1835 	}
   1836 
   1837 	// store out a bit set for each side where the bounds is outside the clip space
   1838 	*outBits = (byte)( bits ^ 63 );
   1839 
   1840 	// if any bits weren't set, the bounds is completely off one side of the frustum
   1841 	return ( bits != 63 );
   1842 
   1843 #endif
   1844 }
   1845 
   1846 /*
   1847 ========================
   1848 idRenderMatrix::CullExtrudedBoundsToMVPbits
   1849 
   1850 Returns true if nothing contained in the extruded bounds is transformed by the
   1851 given Model View Projection (MVP) matrix to anything inside the clip space. 'outBits' receives one bit per clip space side ( bits 0-1: X, bits 2-3: Y, bits 4-5: Z ), set when no tested corner is on the inner side of that clip plane.
   1852 
   1853 The given bounds is extruded in the 'extrudeDirection' up to the 'clipPlane'. Both the eight original corners and the eight extruded corners are tested.
   1854 
   1855 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
   1856 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
   1857 
   1858 When all the corners of the bounding box are behind one of the six frustum planes, the box is
   1859 culled. This is conservative, because some boxes may "cross corners" and can be in front of a
   1860 frustum plane, but only while also being behind another one.
   1861 ========================
   1862 */
   1863 bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, byte * outBits, bool zeroToOne ) {
   1864 	assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );	// the extrude direction must not be parallel to the clip plane because the reciprocal of this dot product is taken below
   1865 
   1866 #ifdef ID_WIN_X86_SSE2_INTRIN
   1867 
   1868 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
   1869 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
   1870 	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
   1871 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
   1872 
   1873 	__m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one;	// W is multiplied with this to get the lower clip bound, mirrors 'zeroToOne ? 0.0f : -c[3]' in the scalar path
   1874 
   1875 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   1876 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   1877 
   1878 	// take the four points on the X-Y plane
   1879 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   1880 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   1881 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   1882 
   1883 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   1884 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   1885 
   1886 	__m128 cullBits0;	// lanes set where a corner has X > minW
   1887 	__m128 cullBits1;	// lanes set where a corner has X < maxW
   1888 	__m128 cullBits2;	// lanes set where a corner has Y > minW
   1889 	__m128 cullBits3;	// lanes set where a corner has Y < maxW
   1890 	__m128 cullBits4;	// lanes set where a corner has Z > minZ
   1891 	__m128 cullBits5;	// lanes set where a corner has Z < maxW
   1892 
   1893 	// calculate the cull bits for the bounding box corners
   1894 	{
   1895 		// compute four partial X,Y,Z,W values
   1896 		__m128 parx = _mm_splat_ps( mvp0, 3 );
   1897 		__m128 pary = _mm_splat_ps( mvp1, 3 );
   1898 		__m128 parz = _mm_splat_ps( mvp2, 3 );
   1899 		__m128 parw = _mm_splat_ps( mvp3, 3 );
   1900 
   1901 		parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
   1902 		pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
   1903 		parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   1904 		parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   1905 
   1906 		parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
   1907 		pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
   1908 		parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   1909 		parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   1910 
   1911 		// compute full X,Y,Z,W values
   1912 		__m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   1913 		__m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   1914 		__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   1915 		__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   1916 
   1917 		__m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );	// the four corners at min Z
   1918 		__m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
   1919 		__m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
   1920 		__m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
   1921 
   1922 		__m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );	// the four corners at max Z
   1923 		__m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
   1924 		__m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
   1925 		__m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
   1926 
   1927 		__m128 maxW0 = w0;
   1928 		__m128 maxW1 = w1;
   1929 		__m128 minW0 = _mm_mul_ps( w0, minMul );
   1930 		__m128 minW1 = _mm_mul_ps( w1, minMul );
   1931 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   1932 		__m128 minZ0 = vector_float_zero;
   1933 		__m128 minZ1 = vector_float_zero;
   1934 #else
   1935 		__m128 minZ0 = minW0;
   1936 		__m128 minZ1 = minW1;
   1937 #endif
   1938 
   1939 		cullBits0 = _mm_cmpgt_ps( x0, minW0 );	// the first set of tests initializes the cull bits, all later tests are OR-ed in
   1940 		cullBits1 = _mm_cmpgt_ps( maxW0, x0 );
   1941 		cullBits2 = _mm_cmpgt_ps( y0, minW0 );
   1942 		cullBits3 = _mm_cmpgt_ps( maxW0, y0 );
   1943 		cullBits4 = _mm_cmpgt_ps( z0, minZ0 );	// NOTE: using minZ0
   1944 		cullBits5 = _mm_cmpgt_ps( maxW0, z0 );
   1945 
   1946 		cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
   1947 		cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
   1948 		cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
   1949 		cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
   1950 		cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) );	// NOTE: using minZ1
   1951 		cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
   1952 	}
   1953 
   1954 	// calculate and include the cull bits for the extruded bounding box corners
   1955 	{
   1956 		__m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 );	// the four clip plane components, each replicated to all lanes
   1957 		__m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 );
   1958 		__m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 );
   1959 		__m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 );
   1960 
   1961 		__m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 );
   1962 		__m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 );
   1963 		__m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 );
   1964 
   1965 		__m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) );	// first three clip plane components dotted with the extrude direction, same role as 'closing' in the scalar path
   1966 		__m128 invClosing = _mm_rcp32_ps( closing );	// NOTE(review): _mm_rcp32_ps is defined elsewhere, presumably a reciprocal; compare '-1.0f / closing' in the scalar path
   1967 		invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit );	// presumably flips the sign to get the negated reciprocal
   1968 
   1969 		__m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) );	// X,Y,W part of the plane equation shared by the min Z and max Z corners
   1970 		__m128 d0 = _mm_madd_ps( clipZ, vz0, dt );	// plane equation evaluated at the four min Z corners
   1971 		__m128 d1 = _mm_madd_ps( clipZ, vz1, dt );	// plane equation evaluated at the four max Z corners
   1972 
   1973 		d0 = _mm_mul_ps( d0, invClosing );	// same role as 'extrudeDist' in the scalar path
   1974 		d1 = _mm_mul_ps( d1, invClosing );
   1975 
   1976 		__m128 vx0 = _mm_madd_ps( extrudeX, d0, vx );	// extruded corners: v + extrudeDirection * extrudeDist
   1977 		__m128 vx1 = _mm_madd_ps( extrudeX, d1, vx );
   1978 
   1979 		__m128 vy0 = _mm_madd_ps( extrudeY, d0, vy );
   1980 		__m128 vy1 = _mm_madd_ps( extrudeY, d1, vy );
   1981 
   1982 		vz0 = _mm_madd_ps( extrudeZ, d0, vz0 );	// NOTE: vz0 and vz1 are overwritten with the extruded Z values
   1983 		vz1 = _mm_madd_ps( extrudeZ, d1, vz1 );
   1984 
   1985 		__m128 mvp0X = _mm_splat_ps( mvp0, 0 );	// transform the extruded corners; X and Y now differ between the two corner sets so no partial sums are shared
   1986 		__m128 mvp1X = _mm_splat_ps( mvp1, 0 );
   1987 		__m128 mvp2X = _mm_splat_ps( mvp2, 0 );
   1988 		__m128 mvp3X = _mm_splat_ps( mvp3, 0 );
   1989 
   1990 		__m128 mvp0W = _mm_splat_ps( mvp0, 3 );
   1991 		__m128 mvp1W = _mm_splat_ps( mvp1, 3 );
   1992 		__m128 mvp2W = _mm_splat_ps( mvp2, 3 );
   1993 		__m128 mvp3W = _mm_splat_ps( mvp3, 3 );
   1994 
   1995 		__m128 x0 = _mm_madd_ps( vx0, mvp0X, mvp0W );
   1996 		__m128 y0 = _mm_madd_ps( vx0, mvp1X, mvp1W );
   1997 		__m128 z0 = _mm_madd_ps( vx0, mvp2X, mvp2W );
   1998 		__m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W );
   1999 
   2000 		__m128 x1 = _mm_madd_ps( vx1, mvp0X, mvp0W );
   2001 		__m128 y1 = _mm_madd_ps( vx1, mvp1X, mvp1W );
   2002 		__m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W );
   2003 		__m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W );
   2004 
   2005 		__m128 mvp0Y = _mm_splat_ps( mvp0, 1 );
   2006 		__m128 mvp1Y = _mm_splat_ps( mvp1, 1 );
   2007 		__m128 mvp2Y = _mm_splat_ps( mvp2, 1 );
   2008 		__m128 mvp3Y = _mm_splat_ps( mvp3, 1 );
   2009 
   2010 		x0 = _mm_madd_ps( vy0, mvp0Y, x0 ); //-V537
   2011 		y0 = _mm_madd_ps( vy0, mvp1Y, y0 );
   2012 		z0 = _mm_madd_ps( vy0, mvp2Y, z0 ); //-V537
   2013 		w0 = _mm_madd_ps( vy0, mvp3Y, w0 );
   2014 
   2015 		x1 = _mm_madd_ps( vy1, mvp0Y, x1 ); //-V537
   2016 		y1 = _mm_madd_ps( vy1, mvp1Y, y1 );
   2017 		z1 = _mm_madd_ps( vy1, mvp2Y, z1 ); //-V537
   2018 		w1 = _mm_madd_ps( vy1, mvp3Y, w1 );
   2019 
   2020 		__m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   2021 		__m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   2022 		__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   2023 		__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   2024 
   2025 		x0 = _mm_madd_ps( vz0, mvp0Z, x0 );
   2026 		y0 = _mm_madd_ps( vz0, mvp1Z, y0 ); //-V537
   2027 		z0 = _mm_madd_ps( vz0, mvp2Z, z0 );
   2028 		w0 = _mm_madd_ps( vz0, mvp3Z, w0 );
   2029 
   2030 		x1 = _mm_madd_ps( vz1, mvp0Z, x1 );
   2031 		y1 = _mm_madd_ps( vz1, mvp1Z, y1 ); //-V537
   2032 		z1 = _mm_madd_ps( vz1, mvp2Z, z1 );
   2033 		w1 = _mm_madd_ps( vz1, mvp3Z, w1 );
   2034 
   2035 		__m128 maxW0 = w0;
   2036 		__m128 maxW1 = w1;
   2037 		__m128 minW0 = _mm_mul_ps( w0, minMul );
   2038 		__m128 minW1 = _mm_mul_ps( w1, minMul );
   2039 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   2040 		__m128 minZ0 = vector_float_zero;
   2041 		__m128 minZ1 = vector_float_zero;
   2042 #else
   2043 		__m128 minZ0 = minW0;
   2044 		__m128 minZ1 = minW1;
   2045 #endif
   2046 
   2047 		cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x0, minW0 ) );	// accumulate on top of the bits of the original corners
   2048 		cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW0, x0 ) );
   2049 		cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y0, minW0 ) );
   2050 		cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW0, y0 ) );
   2051 		cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z0, minZ0 ) );	// NOTE: using minZ0
   2052 		cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW0, z0 ) );
   2053 
   2054 		cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) );
   2055 		cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) );
   2056 		cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) );
   2057 		cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) );
   2058 		cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) );	// NOTE: using minZ1
   2059 		cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) );
   2060 	}
   2061 
   2062 	cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 );	// reduce each test to its own bit; vector_float_maskN is defined elsewhere, presumably ( 1 << N ) to match the scalar path
   2063 	cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 );
   2064 	cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 );
   2065 	cullBits3 = _mm_and_ps( cullBits3, vector_float_mask3 );
   2066 	cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 );
   2067 	cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 );
   2068 
   2069 	cullBits0 = _mm_or_ps( cullBits0, cullBits1 );	// merge the six tests into one value per lane
   2070 	cullBits2 = _mm_or_ps( cullBits2, cullBits3 );
   2071 	cullBits4 = _mm_or_ps( cullBits4, cullBits5 );
   2072 	cullBits0 = _mm_or_ps( cullBits0, cullBits2 );
   2073 	cullBits0 = _mm_or_ps( cullBits0, cullBits4 );
   2074 
   2075 	cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );	// OR the four lanes together: first lane N with lane N^2 ...
   2076 	cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) );	// ... then with the neighboring lane so the lowest lane covers all corners
   2077 
   2078 	int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 );	// read the combined bits from the lowest lane
   2079 
   2080 	*outBits = (byte)(bits ^ 63);	// store out a bit set for each side where the bounds is outside the clip space
   2081 
   2082 	return ( bits != 63 );	// if any bits weren't set, the bounds is completely off one side of the frustum
   2083 
   2084 #else
   2085 
   2086 	int bits = 0;	// bit N gets set as soon as one tested point passes test N
   2087 
   2088 	float closing = extrudeDirection * clipPlane.Normal();	// the assert above checks this is not near zero
   2089 	float invClosing = -1.0f / closing;
   2090 
   2091 	idVec3 v;
   2092 	for ( int x = 0; x < 2; x++ ) {	// the three nested loops visit all eight corners of the bounds
   2093 		v[0] = bounds[x][0];
   2094 		for ( int y = 0; y < 2; y++ ) {
   2095 			v[1] = bounds[y][1];
   2096 			for ( int z = 0; z < 2; z++ ) {
   2097 				v[2] = bounds[z][2];
   2098 
   2099 				for ( int extrude = 0; extrude <= 1; extrude++ ) {	// test the corner itself ( 0 ) and the extruded corner ( 1 )
   2100 
   2101 					idVec3 test;
   2102 					if ( extrude ) {
   2103 						const float extrudeDist = clipPlane.Distance( v ) * invClosing;	// how far to move along 'extrudeDirection'; per the function comment this extrudes up to the clip plane
   2104 						test = v + extrudeDirection * extrudeDist;
   2105 					} else {
   2106 						test = v;
   2107 					}
   2108 
   2109 					idVec4 c;	// 'test' transformed by the MVP
   2110 					for ( int i = 0; i < 4; i++ ) {
   2111 						c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3];
   2112 					}
   2113 
   2114 					const float minW = zeroToOne ? 0.0f : -c[3];
   2115 					const float maxW = c[3];
   2116 #if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
   2117 					const float minZ = 0.0f;
   2118 #else
   2119 					const float minZ = minW;
   2120 #endif
   2121 
   2122 					if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
   2123 					if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
   2124 					if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
   2125 					if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
   2126 					if ( c[2] > minZ ) { bits |= ( 1 << 4 ); }	// NOTE: using minZ
   2127 					if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
   2128 				}
   2129 			}
   2130 		}
   2131 	}
   2132 
   2133 	// store out a bit set for each side where the bounds is outside the clip space
   2134 	*outBits = (byte)(bits ^ 63);
   2135 
   2136 	// if any bits weren't set, the bounds is completely off one side of the frustum
   2137 	return ( bits != 63 );
   2138 
   2139 #endif
   2140 }
   2141 
   2142 /*
   2143 ========================
   2144 idRenderMatrix::ProjectedBounds
   2145 
   2146 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
   2147 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
   2148 
   2149 The given bounding box is not clipped to the MVP so the projected bounds may not be as tight as possible.
   2150 If the given bounding box is W=0 clipped then the projected bounds will cover the full X-Y range.
   2151 Note that while projected[0][2] will be set to the minimum when the given bounding box is W=0 clipped,
   2152 projected[1][2] will still be valid and will NOT be set to the maximum when the given bounding box
   2153 is W=0 clipped.
   2154 ========================
   2155 */
   2156 void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
   2157 #ifdef ID_WIN_X86_SSE2_INTRIN
   2158 
   2159 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
   2160 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
   2161 	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
   2162 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
   2163 
   2164 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   2165 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   2166 
   2167 	// take the four points on the X-Y plane
   2168 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   2169 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   2170 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   2171 
   2172 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   2173 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   2174 
   2175 	// compute four partial X,Y,Z,W values
   2176 	__m128 parx = _mm_splat_ps( mvp0, 3 );
   2177 	__m128 pary = _mm_splat_ps( mvp1, 3 );
   2178 	__m128 parz = _mm_splat_ps( mvp2, 3 );
   2179 	__m128 parw = _mm_splat_ps( mvp3, 3 );
   2180 
   2181 	parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
   2182 	pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
   2183 	parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   2184 	parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   2185 
   2186 	parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
   2187 	pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
   2188 	parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   2189 	parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   2190 
   2191 	// compute full X,Y,Z,W values
   2192 	__m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   2193 	__m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   2194 	__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   2195 	__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   2196 
   2197 	__m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );	// the four corners at min Z
   2198 	__m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
   2199 	__m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
   2200 	__m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
   2201 
   2202 	__m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );	// the four corners at max Z
   2203 	__m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
   2204 	__m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
   2205 	__m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
   2206 
   2207 	__m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );	// lanes set where the corner is W=0 clipped; NOTE(review): strict compare here, the scalar path uses <=
   2208 	__m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
   2209 
   2210 	w0 = _mm_sel_ps( w0, vector_float_one, s0 );	// presumably replaces W with one for the clipped corners so the reciprocal below stays finite
   2211 	w1 = _mm_sel_ps( w1, vector_float_one, s1 );
   2212 
   2213 	__m128 rw0 = _mm_rcp32_ps( w0 );	// NOTE(review): _mm_rcp32_ps is defined elsewhere, presumably a reciprocal; compare '1.0f / tw' in the scalar path
   2214 	__m128 rw1 = _mm_rcp32_ps( w1 );
   2215 
   2216 	x0 = _mm_mul_ps( x0, rw0 );	// perspective divide
   2217 	y0 = _mm_mul_ps( y0, rw0 );
   2218 	z0 = _mm_mul_ps( z0, rw0 );
   2219 
   2220 	x1 = _mm_mul_ps( x1, rw1 );
   2221 	y1 = _mm_mul_ps( y1, rw1 );
   2222 	z1 = _mm_mul_ps( z1, rw1 );
   2223 
   2224 	__m128 minX = _mm_min_ps( x0, x1 );	// per-lane min/max over the min Z and max Z corners
   2225 	__m128 minY = _mm_min_ps( y0, y1 );
   2226 	__m128 minZ = _mm_min_ps( z0, z1 );
   2227 
   2228 	__m128 maxX = _mm_max_ps( x0, x1 );
   2229 	__m128 maxY = _mm_max_ps( y0, y1 );
   2230 	__m128 maxZ = _mm_max_ps( z0, z1 );
   2231 
   2232 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );	// horizontal min across the four lanes, done in two steps
   2233 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2234 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2235 
   2236 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2237 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2238 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2239 
   2240 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );	// horizontal max across the four lanes, done in two steps
   2241 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2242 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2243 
   2244 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2245 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2246 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2247 
   2248 	s0 = _mm_or_ps( s0, s1 );	// combine the W=0 clipped flags of all eight corners into every lane
   2249 	s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2250 	s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2251 
   2252 	minX = _mm_sel_ps( minX, vector_float_neg_infinity, s0 );	// if any corner is W=0 clipped, open up the bounds like the scalar path does
   2253 	minY = _mm_sel_ps( minY, vector_float_neg_infinity, s0 );
   2254 	minZ = _mm_sel_ps( minZ, vector_float_neg_infinity, s0 );
   2255 
   2256 	maxX = _mm_sel_ps( maxX, vector_float_pos_infinity, s0 );
   2257 	maxY = _mm_sel_ps( maxY, vector_float_pos_infinity, s0 );
   2258 	// NOTE: maxZ is valid either way
   2259 
   2260 	if ( windowSpace ) {
   2261 		minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );	// convert to window coords, mirrors 'x * 0.5f + 0.5f' in the scalar path
   2262 		maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
   2263 
   2264 		minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
   2265 		maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
   2266 
   2267 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   2268 		minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
   2269 		maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
   2270 #endif
   2271 
   2272 		minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );	// clamp to [0, 1] range
   2273 		maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
   2274 
   2275 		minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
   2276 		maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
   2277 
   2278 		minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
   2279 		maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
   2280 	}
   2281 
   2282 	_mm_store_ss( & projected[0].x, minX );	// write out the lowest lane of each result
   2283 	_mm_store_ss( & projected[0].y, minY );
   2284 	_mm_store_ss( & projected[0].z, minZ );
   2285 
   2286 	_mm_store_ss( & projected[1].x, maxX );
   2287 	_mm_store_ss( & projected[1].y, maxY );
   2288 	_mm_store_ss( & projected[1].z, maxZ );
   2289 
   2290 #else
   2291 
   2292 	for ( int i = 0; i < 3; i++ ) {	// start with inverted bounds so the first Min/Max below takes the first projected corner
   2293 		projected[0][i] = RENDER_MATRIX_INFINITY;
   2294 		projected[1][i] = - RENDER_MATRIX_INFINITY;
   2295 	}
   2296 
   2297 	idVec3 v;
   2298 	for ( int x = 0; x < 2; x++ ) {	// the three nested loops visit all eight corners of the bounds
   2299 		v[0] = bounds[x][0];
   2300 		for ( int y = 0; y < 2; y++ ) {
   2301 			v[1] = bounds[y][1];
   2302 			for ( int z = 0; z < 2; z++ ) {
   2303 				v[2] = bounds[z][2];
   2304 
   2305 				float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
   2306 				float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
   2307 				float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   2308 				float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   2309 
   2310 				if ( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) {	// the corner is W=0 clipped, open up the bounds instead of dividing by W
   2311 					projected[0][0] = -RENDER_MATRIX_INFINITY;
   2312 					projected[0][1] = -RENDER_MATRIX_INFINITY;
   2313 					projected[0][2] = -RENDER_MATRIX_INFINITY;
   2314 					projected[1][0] = RENDER_MATRIX_INFINITY;
   2315 					projected[1][1] = RENDER_MATRIX_INFINITY;
   2316 					// NOTE: projected[1][2] is still valid
   2317 					continue;	// NOTE(review): this corner's Z never reaches projected[1][2], while the SIMD path appears to still fold the Z of clipped corners ( divided by one ) into maxZ - confirm which is intended
   2318 				}
   2319 
   2320 				float rw = 1.0f / tw;
   2321 
   2322 				tx = tx * rw;	// perspective divide
   2323 				ty = ty * rw;
   2324 				tz = tz * rw;
   2325 
   2326 				projected[0][0] = Min( projected[0][0], tx );
   2327 				projected[0][1] = Min( projected[0][1], ty );
   2328 				projected[0][2] = Min( projected[0][2], tz );
   2329 
   2330 				projected[1][0] = Max( projected[1][0], tx );
   2331 				projected[1][1] = Max( projected[1][1], ty );
   2332 				projected[1][2] = Max( projected[1][2], tz );
   2333 			}
   2334 		}
   2335 	}
   2336 
   2337 	if ( windowSpace ) {
   2338 		// convert to window coords
   2339 		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
   2340 		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
   2341 
   2342 		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
   2343 		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
   2344 
   2345 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   2346 		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
   2347 		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
   2348 #endif
   2349 
   2350 		// clamp to [0, 1] range
   2351 		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
   2352 		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
   2353 
   2354 		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
   2355 		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
   2356 
   2357 		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
   2358 		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
   2359 	}
   2360 
   2361 #endif
   2362 }
   2363 
   2364 /*
   2365 ========================
   2366 idRenderMatrix::ProjectedNearClippedBounds
   2367 
   2368 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
   2369 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
   2370 
   2371 The given bounding box is first near clipped so the projected bounds do not cover the full X-Y range when
   2372 the given bounding box crosses the W=0 plane. However, the given bounding box is not clipped against the
   2373 other planes so the projected bounds are still not as tight as they could be if the given bounding box
   2374 crosses a corner. Fortunately, clipping to the near clipping plane typically provides more than 50% of
   2375 the gain between not clipping at all and fully clipping the bounding box to all planes. Only clipping to
   2376 the near clipping plane is much cheaper than clipping to all planes and can be easily implemented with
   2377 completely branchless SIMD.
   2378 ========================
   2379 */
   2380 void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
   2381 /*
   2382             4----{E}---5
   2383  +         /|         /|
   2384  Z      {H} {I}    {F} |
   2385  -     /    |     /   {J}
   2386       7--{G}-----6     |
   2387       |     |    |     |
   2388      {L}    0----|-{A}-1
   2389       |    /    {K}   /       -
   2390       | {D}      | {B}       Y
   2391       |/         |/         +
   2392       3---{C}----2
   2393 
   2394 	    - X +
   2395 */
   2396 
   2397 #ifdef ID_WIN_X86_SSE2_INTRIN
   2398 
   2399 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
   2400 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
   2401 	const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
   2402 	const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
   2403 
   2404 	const __m128 b0 = _mm_loadu_bounds_0( bounds );
   2405 	const __m128 b1 = _mm_loadu_bounds_1( bounds );
   2406 
   2407 	// take the four points on the X-Y plane
   2408 	const __m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   2409 	const __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   2410 	const __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   2411 
   2412 	const __m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   2413 	const __m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   2414 
   2415 	// compute four partial X,Y,Z,W values
   2416 	__m128 parx = _mm_splat_ps( mvp0, 3 );
   2417 	__m128 pary = _mm_splat_ps( mvp1, 3 );
   2418 	__m128 parz = _mm_splat_ps( mvp2, 3 );
   2419 	__m128 parw = _mm_splat_ps( mvp3, 3 );
   2420 
   2421 	parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
   2422 	pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
   2423 	parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   2424 	parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   2425 
   2426 	parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
   2427 	pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
   2428 	parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   2429 	parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   2430 
   2431 	// compute full X,Y,Z,W values
   2432 	const __m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   2433 	const __m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   2434 	const __m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   2435 	const __m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   2436 
   2437 	const __m128 x_0123 = _mm_madd_ps( vz0, mvp0Z, parx );
   2438 	const __m128 y_0123 = _mm_madd_ps( vz0, mvp1Z, pary );
   2439 	const __m128 z_0123 = _mm_madd_ps( vz0, mvp2Z, parz );
   2440 	const __m128 w_0123 = _mm_madd_ps( vz0, mvp3Z, parw );
   2441 
   2442 	const __m128 x_4567 = _mm_madd_ps( vz1, mvp0Z, parx );
   2443 	const __m128 y_4567 = _mm_madd_ps( vz1, mvp1Z, pary );
   2444 	const __m128 z_4567 = _mm_madd_ps( vz1, mvp2Z, parz );
   2445 	const __m128 w_4567 = _mm_madd_ps( vz1, mvp3Z, parw );
   2446 
   2447 	// rotate the X,Y,Z,W values up by one
   2448 	const __m128 x_1230 = _mm_perm_ps( x_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2449 	const __m128 y_1230 = _mm_perm_ps( y_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2450 	const __m128 z_1230 = _mm_perm_ps( z_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2451 	const __m128 w_1230 = _mm_perm_ps( w_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2452 
   2453 	const __m128 x_5674 = _mm_perm_ps( x_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2454 	const __m128 y_5674 = _mm_perm_ps( y_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2455 	const __m128 z_5674 = _mm_perm_ps( z_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2456 	const __m128 w_5674 = _mm_perm_ps( w_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) );
   2457 
   2458 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   2459 	const __m128 d_0123 = z_0123;
   2460 	const __m128 d_4567 = z_4567;
   2461 	const __m128 d_1230 = z_1230;
   2462 	const __m128 d_5674 = z_5674;
   2463 #else
   2464 	const __m128 d_0123 = _mm_add_ps( z_0123, w_0123 );
   2465 	const __m128 d_4567 = _mm_add_ps( z_4567, w_4567 );
   2466 	const __m128 d_1230 = _mm_add_ps( z_1230, w_1230 );
   2467 	const __m128 d_5674 = _mm_add_ps( z_5674, w_5674 );
   2468 #endif
   2469 
   2470 	const __m128 deltaABCD = _mm_sub_ps( d_0123, d_1230 );
   2471 	const __m128 deltaEFGH = _mm_sub_ps( d_4567, d_5674 );
   2472 	const __m128 deltaIJKL = _mm_sub_ps( d_0123, d_4567 );
   2473 
   2474 	const __m128 maskABCD = _mm_cmpgt_ps( _mm_and_ps( deltaABCD, vector_float_abs_mask ), vector_float_smallest_non_denorm );
   2475 	const __m128 maskEFGH = _mm_cmpgt_ps( _mm_and_ps( deltaEFGH, vector_float_abs_mask ), vector_float_smallest_non_denorm );
   2476 	const __m128 maskIJKL = _mm_cmpgt_ps( _mm_and_ps( deltaIJKL, vector_float_abs_mask ), vector_float_smallest_non_denorm );
   2477 
   2478 	const __m128 fractionABCD = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaABCD, maskABCD ) ), maskABCD );
   2479 	const __m128 fractionEFGH = _mm_and_ps( _mm_div32_ps( d_4567, _mm_sel_ps( vector_float_one, deltaEFGH, maskEFGH ) ), maskEFGH );
   2480 	const __m128 fractionIJKL = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaIJKL, maskIJKL ) ), maskIJKL );
   2481 
   2482 	const __m128 clipABCD = _mm_and_ps( _mm_cmpgt_ps( fractionABCD, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionABCD ) );
   2483 	const __m128 clipEFGH = _mm_and_ps( _mm_cmpgt_ps( fractionEFGH, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionEFGH ) );
   2484 	const __m128 clipIJKL = _mm_and_ps( _mm_cmpgt_ps( fractionIJKL, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionIJKL ) );
   2485 
   2486 	const __m128 intersectionABCD_x = _mm_madd_ps( fractionABCD, _mm_sub_ps( x_1230, x_0123 ), x_0123 );
   2487 	const __m128 intersectionABCD_y = _mm_madd_ps( fractionABCD, _mm_sub_ps( y_1230, y_0123 ), y_0123 );
   2488 	const __m128 intersectionABCD_z = _mm_madd_ps( fractionABCD, _mm_sub_ps( z_1230, z_0123 ), z_0123 );
   2489 	const __m128 intersectionABCD_w = _mm_madd_ps( fractionABCD, _mm_sub_ps( w_1230, w_0123 ), w_0123 );
   2490 
   2491 	const __m128 intersectionEFGH_x = _mm_madd_ps( fractionEFGH, _mm_sub_ps( x_5674, x_4567 ), x_4567 );
   2492 	const __m128 intersectionEFGH_y = _mm_madd_ps( fractionEFGH, _mm_sub_ps( y_5674, y_4567 ), y_4567 );
   2493 	const __m128 intersectionEFGH_z = _mm_madd_ps( fractionEFGH, _mm_sub_ps( z_5674, z_4567 ), z_4567 );
   2494 	const __m128 intersectionEFGH_w = _mm_madd_ps( fractionEFGH, _mm_sub_ps( w_5674, w_4567 ), w_4567 );
   2495 
   2496 	const __m128 intersectionIJKL_x = _mm_madd_ps( fractionIJKL, _mm_sub_ps( x_4567, x_0123 ), x_0123 );
   2497 	const __m128 intersectionIJKL_y = _mm_madd_ps( fractionIJKL, _mm_sub_ps( y_4567, y_0123 ), y_0123 );
   2498 	const __m128 intersectionIJKL_z = _mm_madd_ps( fractionIJKL, _mm_sub_ps( z_4567, z_0123 ), z_0123 );
   2499 	const __m128 intersectionIJKL_w = _mm_madd_ps( fractionIJKL, _mm_sub_ps( w_4567, w_0123 ), w_0123 );
   2500 
   2501 	const __m128 mask_0123 = _mm_cmpgt_ps( vector_float_zero, d_0123 );
   2502 	const __m128 mask_1230 = _mm_cmpgt_ps( vector_float_zero, d_1230 );
   2503 	const __m128 mask_4567 = _mm_cmpgt_ps( vector_float_zero, d_4567 );
   2504 	const __m128 mask_5674 = _mm_cmpgt_ps( vector_float_zero, d_5674 );
   2505 
   2506 	const __m128 maskABCD_0123 = _mm_and_ps( clipABCD, mask_0123 );
   2507 	const __m128 maskABCD_1230 = _mm_and_ps( clipABCD, mask_1230 );
   2508 	const __m128 maskEFGH_4567 = _mm_and_ps( clipEFGH, mask_4567 );
   2509 	const __m128 maskEFGH_5674 = _mm_and_ps( clipEFGH, mask_5674 );
   2510 	const __m128 maskIJKL_0123 = _mm_and_ps( clipIJKL, mask_0123 );
   2511 	const __m128 maskIJKL_4567 = _mm_and_ps( clipIJKL, mask_4567 );
   2512 
   2513 	__m128 edgeVertsABCD_x0 = _mm_sel_ps( x_0123, intersectionABCD_x, maskABCD_0123 );
   2514 	__m128 edgeVertsABCD_y0 = _mm_sel_ps( y_0123, intersectionABCD_y, maskABCD_0123 );
   2515 	__m128 edgeVertsABCD_z0 = _mm_sel_ps( z_0123, intersectionABCD_z, maskABCD_0123 );
   2516 	__m128 edgeVertsABCD_w0 = _mm_sel_ps( w_0123, intersectionABCD_w, maskABCD_0123 );
   2517 
   2518 	__m128 edgeVertsABCD_x1 = _mm_sel_ps( x_1230, intersectionABCD_x, maskABCD_1230 );
   2519 	__m128 edgeVertsABCD_y1 = _mm_sel_ps( y_1230, intersectionABCD_y, maskABCD_1230 );
   2520 	__m128 edgeVertsABCD_z1 = _mm_sel_ps( z_1230, intersectionABCD_z, maskABCD_1230 );
   2521 	__m128 edgeVertsABCD_w1 = _mm_sel_ps( w_1230, intersectionABCD_w, maskABCD_1230 );
   2522 
   2523 	__m128 edgeVertsEFGH_x0 = _mm_sel_ps( x_4567, intersectionEFGH_x, maskEFGH_4567 );
   2524 	__m128 edgeVertsEFGH_y0 = _mm_sel_ps( y_4567, intersectionEFGH_y, maskEFGH_4567 );
   2525 	__m128 edgeVertsEFGH_z0 = _mm_sel_ps( z_4567, intersectionEFGH_z, maskEFGH_4567 );
   2526 	__m128 edgeVertsEFGH_w0 = _mm_sel_ps( w_4567, intersectionEFGH_w, maskEFGH_4567 );
   2527 
   2528 	__m128 edgeVertsEFGH_x1 = _mm_sel_ps( x_5674, intersectionEFGH_x, maskEFGH_5674 );
   2529 	__m128 edgeVertsEFGH_y1 = _mm_sel_ps( y_5674, intersectionEFGH_y, maskEFGH_5674 );
   2530 	__m128 edgeVertsEFGH_z1 = _mm_sel_ps( z_5674, intersectionEFGH_z, maskEFGH_5674 );
   2531 	__m128 edgeVertsEFGH_w1 = _mm_sel_ps( w_5674, intersectionEFGH_w, maskEFGH_5674 );
   2532 
   2533 	__m128 edgeVertsIJKL_x0 = _mm_sel_ps( x_0123, intersectionIJKL_x, maskIJKL_0123 );
   2534 	__m128 edgeVertsIJKL_y0 = _mm_sel_ps( y_0123, intersectionIJKL_y, maskIJKL_0123 );
   2535 	__m128 edgeVertsIJKL_z0 = _mm_sel_ps( z_0123, intersectionIJKL_z, maskIJKL_0123 );
   2536 	__m128 edgeVertsIJKL_w0 = _mm_sel_ps( w_0123, intersectionIJKL_w, maskIJKL_0123 );
   2537 
   2538 	__m128 edgeVertsIJKL_x1 = _mm_sel_ps( x_4567, intersectionIJKL_x, maskIJKL_4567 );
   2539 	__m128 edgeVertsIJKL_y1 = _mm_sel_ps( y_4567, intersectionIJKL_y, maskIJKL_4567 );
   2540 	__m128 edgeVertsIJKL_z1 = _mm_sel_ps( z_4567, intersectionIJKL_z, maskIJKL_4567 );
   2541 	__m128 edgeVertsIJKL_w1 = _mm_sel_ps( w_4567, intersectionIJKL_w, maskIJKL_4567 );
   2542 
   2543 	const __m128 maskABCD_w0 = _mm_cmpgt_ps( edgeVertsABCD_w0, vector_float_smallest_non_denorm );
   2544 	const __m128 maskABCD_w1 = _mm_cmpgt_ps( edgeVertsABCD_w1, vector_float_smallest_non_denorm );
   2545 	const __m128 maskEFGH_w0 = _mm_cmpgt_ps( edgeVertsEFGH_w0, vector_float_smallest_non_denorm );
   2546 	const __m128 maskEFGH_w1 = _mm_cmpgt_ps( edgeVertsEFGH_w1, vector_float_smallest_non_denorm );
   2547 	const __m128 maskIJKL_w0 = _mm_cmpgt_ps( edgeVertsIJKL_w0, vector_float_smallest_non_denorm );
   2548 	const __m128 maskIJKL_w1 = _mm_cmpgt_ps( edgeVertsIJKL_w1, vector_float_smallest_non_denorm );
   2549 
   2550 	edgeVertsABCD_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w0, maskABCD_w0 ) );
   2551 	edgeVertsABCD_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w1, maskABCD_w1 ) );
   2552 	edgeVertsEFGH_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w0, maskEFGH_w0 ) );
   2553 	edgeVertsEFGH_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w1, maskEFGH_w1 ) );
   2554 	edgeVertsIJKL_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w0, maskIJKL_w0 ) );
   2555 	edgeVertsIJKL_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w1, maskIJKL_w1 ) );
   2556 
   2557 	edgeVertsABCD_x0 = _mm_mul_ps( edgeVertsABCD_x0, edgeVertsABCD_w0 );
   2558 	edgeVertsABCD_x1 = _mm_mul_ps( edgeVertsABCD_x1, edgeVertsABCD_w1 );
   2559 	edgeVertsEFGH_x0 = _mm_mul_ps( edgeVertsEFGH_x0, edgeVertsEFGH_w0 );
   2560 	edgeVertsEFGH_x1 = _mm_mul_ps( edgeVertsEFGH_x1, edgeVertsEFGH_w1 );
   2561 	edgeVertsIJKL_x0 = _mm_mul_ps( edgeVertsIJKL_x0, edgeVertsIJKL_w0 );
   2562 	edgeVertsIJKL_x1 = _mm_mul_ps( edgeVertsIJKL_x1, edgeVertsIJKL_w1 );
   2563 
   2564 	edgeVertsABCD_y0 = _mm_mul_ps( edgeVertsABCD_y0, edgeVertsABCD_w0 );
   2565 	edgeVertsABCD_y1 = _mm_mul_ps( edgeVertsABCD_y1, edgeVertsABCD_w1 );
   2566 	edgeVertsEFGH_y0 = _mm_mul_ps( edgeVertsEFGH_y0, edgeVertsEFGH_w0 );
   2567 	edgeVertsEFGH_y1 = _mm_mul_ps( edgeVertsEFGH_y1, edgeVertsEFGH_w1 );
   2568 	edgeVertsIJKL_y0 = _mm_mul_ps( edgeVertsIJKL_y0, edgeVertsIJKL_w0 );
   2569 	edgeVertsIJKL_y1 = _mm_mul_ps( edgeVertsIJKL_y1, edgeVertsIJKL_w1 );
   2570 
   2571 	edgeVertsABCD_z0 = _mm_mul_ps( edgeVertsABCD_z0, edgeVertsABCD_w0 );
   2572 	edgeVertsABCD_z1 = _mm_mul_ps( edgeVertsABCD_z1, edgeVertsABCD_w1 );
   2573 	edgeVertsEFGH_z0 = _mm_mul_ps( edgeVertsEFGH_z0, edgeVertsEFGH_w0 );
   2574 	edgeVertsEFGH_z1 = _mm_mul_ps( edgeVertsEFGH_z1, edgeVertsEFGH_w1 );
   2575 	edgeVertsIJKL_z0 = _mm_mul_ps( edgeVertsIJKL_z0, edgeVertsIJKL_w0 );
   2576 	edgeVertsIJKL_z1 = _mm_mul_ps( edgeVertsIJKL_z1, edgeVertsIJKL_w1 );
   2577 
   2578 	const __m128 posInf = vector_float_pos_infinity;
   2579 	const __m128 negInf = vector_float_neg_infinity;
   2580 
   2581 	const __m128 minX0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_x1, maskABCD_w1 ) );
   2582 	const __m128 minX1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_x1, maskEFGH_w1 ) );
   2583 	const __m128 minX2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_x1, maskIJKL_w1 ) );
   2584 
   2585 	const __m128 minY0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_y1, maskABCD_w1 ) );
   2586 	const __m128 minY1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_y1, maskEFGH_w1 ) );
   2587 	const __m128 minY2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_y1, maskIJKL_w1 ) );
   2588 
   2589 	const __m128 minZ0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_z1, maskABCD_w1 ) );
   2590 	const __m128 minZ1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_z1, maskEFGH_w1 ) );
   2591 	const __m128 minZ2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_z1, maskIJKL_w1 ) );
   2592 
   2593 	const __m128 maxX0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_x1, maskABCD_w1 ) );
   2594 	const __m128 maxX1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_x1, maskEFGH_w1 ) );
   2595 	const __m128 maxX2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_x1, maskIJKL_w1 ) );
   2596 
   2597 	const __m128 maxY0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_y1, maskABCD_w1 ) );
   2598 	const __m128 maxY1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_y1, maskEFGH_w1 ) );
   2599 	const __m128 maxY2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_y1, maskIJKL_w1 ) );
   2600 
   2601 	const __m128 maxZ0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_z1, maskABCD_w1 ) );
   2602 	const __m128 maxZ1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_z1, maskEFGH_w1 ) );
   2603 	const __m128 maxZ2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_z1, maskIJKL_w1 ) );
   2604 
   2605 	__m128 minX = _mm_min_ps( minX0, _mm_min_ps( minX1, minX2 ) );
   2606 	__m128 minY = _mm_min_ps( minY0, _mm_min_ps( minY1, minY2 ) );
   2607 	__m128 minZ = _mm_min_ps( minZ0, _mm_min_ps( minZ1, minZ2 ) );
   2608 
   2609 	__m128 maxX = _mm_max_ps( maxX0, _mm_max_ps( maxX1, maxX2 ) );
   2610 	__m128 maxY = _mm_max_ps( maxY0, _mm_max_ps( maxY1, maxY2 ) );
   2611 	__m128 maxZ = _mm_max_ps( maxZ0, _mm_max_ps( maxZ1, maxZ2 ) );
   2612 
   2613 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2614 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2615 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2616 
   2617 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2618 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2619 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2620 
   2621 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2622 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2623 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   2624 
   2625 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2626 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2627 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   2628 
   2629 	if ( windowSpace ) {
   2630 		minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );
   2631 		maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
   2632 
   2633 		minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
   2634 		maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
   2635 
   2636 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   2637 		minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
   2638 		maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
   2639 #endif
   2640 
   2641 		minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );
   2642 		maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
   2643 
   2644 		minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
   2645 		maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
   2646 
   2647 		minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
   2648 		maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
   2649 	}
   2650 
   2651 	_mm_store_ss( & projected[0].x, minX );
   2652 	_mm_store_ss( & projected[0].y, minY );
   2653 	_mm_store_ss( & projected[0].z, minZ );
   2654 
   2655 	_mm_store_ss( & projected[1].x, maxX );
   2656 	_mm_store_ss( & projected[1].y, maxY );
   2657 	_mm_store_ss( & projected[1].z, maxZ );
   2658 
   2659 #elif 1
   2660 
   2661 {
   2662 	const idVec3 points[8] = {
   2663 		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
   2664 		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
   2665 		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
   2666 		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
   2667 		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
   2668 		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
   2669 		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
   2670 		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
   2671 	};
   2672 
   2673 	idVec4 projectedPoints[8];
   2674 	for ( int i = 0; i < 8; i++ ) {
   2675 		const idVec3 & v = points[i];
   2676 		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
   2677 		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
   2678 		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   2679 		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   2680 	}
   2681 
   2682 	const idVec4 & p0 = projectedPoints[0];
   2683 	const idVec4 & p1 = projectedPoints[1];
   2684 	const idVec4 & p2 = projectedPoints[2];
   2685 	const idVec4 & p3 = projectedPoints[3];
   2686 	const idVec4 & p4 = projectedPoints[4];
   2687 	const idVec4 & p5 = projectedPoints[5];
   2688 	const idVec4 & p6 = projectedPoints[6];
   2689 	const idVec4 & p7 = projectedPoints[7];
   2690 
   2691 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   2692 	const float d0 = p0.z;
   2693 	const float d1 = p1.z;
   2694 	const float d2 = p2.z;
   2695 	const float d3 = p3.z;
   2696 	const float d4 = p4.z;
   2697 	const float d5 = p5.z;
   2698 	const float d6 = p6.z;
   2699 	const float d7 = p7.z;
   2700 #else
   2701 	const float d0 = p0.z + p0.w;
   2702 	const float d1 = p1.z + p1.w;
   2703 	const float d2 = p2.z + p2.w;
   2704 	const float d3 = p3.z + p3.w;
   2705 	const float d4 = p4.z + p4.w;
   2706 	const float d5 = p5.z + p5.w;
   2707 	const float d6 = p6.z + p6.w;
   2708 	const float d7 = p7.z + p7.w;
   2709 #endif
   2710 
   2711 	const float deltaA = d0 - d1;
   2712 	const float deltaB = d1 - d2;
   2713 	const float deltaC = d2 - d3;
   2714 	const float deltaD = d3 - d0;
   2715 
   2716 	const float deltaE = d4 - d5;
   2717 	const float deltaF = d5 - d6;
   2718 	const float deltaG = d6 - d7;
   2719 	const float deltaH = d7 - d4;
   2720 
   2721 	const float deltaI = d0 - d4;
   2722 	const float deltaJ = d1 - d5;
   2723 	const float deltaK = d2 - d6;
   2724 	const float deltaL = d3 - d7;
   2725 
   2726 	const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f;
   2727 	const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f;
   2728 	const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f;
   2729 	const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f;
   2730 
   2731 	const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d4 / deltaE ) : 0.0f;
   2732 	const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f;
   2733 	const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f;
   2734 	const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f;
   2735 
   2736 	const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f;
   2737 	const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f;
   2738 	const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f;
   2739 	const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaL ) : 0.0f;
   2740 
   2741 	const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f );
   2742 	const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f );
   2743 	const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f );
   2744 	const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f );
   2745 
   2746 	const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f );
   2747 	const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f );
   2748 	const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f );
   2749 	const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f );
   2750 
   2751 	const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f );
   2752 	const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f );
   2753 	const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f );
   2754 	const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f );
   2755 
   2756 	const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 );
   2757 	const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 );
   2758 	const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 );
   2759 	const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 );
   2760 
   2761 	const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 );
   2762 	const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 );
   2763 	const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 );
   2764 	const idVec4 intersectionH = p7 + fractionH * ( p4 - p7 );
   2765 
   2766 	const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 );
   2767 	const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 );
   2768 	const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 );
   2769 	const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 );
   2770 
   2771 	idVec4 edgeVerts[24];
   2772 
   2773 	edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0;
   2774 	edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1;
   2775 	edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2;
   2776 	edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3;
   2777 
   2778 	edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? intersectionA : p1;
   2779 	edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2;
   2780 	edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? intersectionC : p3;
   2781 	edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0;
   2782 
   2783 	edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4;
   2784 	edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5;
   2785 	edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6;
   2786 	edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7;
   2787 
   2788 	edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5;
   2789 	edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6;
   2790 	edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7;
   2791 	edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4;
   2792 
   2793 	edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0;
   2794 	edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1;
   2795 	edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2;
   2796 	edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3;
   2797 
   2798 	edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4;
   2799 	edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5;
   2800 	edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6;
   2801 	edgeVerts[23] = ( clipL && d7 < 0.0f ) ? intersectionL : p7;
   2802 
   2803 	idBounds projBnds;
   2804 	for ( int i = 0; i < 3; i++ ) {
   2805 		projBnds[0][i] = RENDER_MATRIX_INFINITY;
   2806 		projBnds[1][i] = - RENDER_MATRIX_INFINITY;
   2807 	}
   2808 
   2809 	for ( int i = 0; i < 24; i++ ) {
   2810 		const idVec4 & v = edgeVerts[i];
   2811 
   2812 		if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2813 			continue;
   2814 		}
   2815 
   2816 		const float rw = 1.0f / v.w;
   2817 
   2818 		const float px = v.x * rw;
   2819 		const float py = v.y * rw;
   2820 		const float pz = v.z * rw;
   2821 
   2822 		projBnds[0][0] = Min( projBnds[0][0], px );
   2823 		projBnds[0][1] = Min( projBnds[0][1], py );
   2824 		projBnds[0][2] = Min( projBnds[0][2], pz );
   2825 
   2826 		projBnds[1][0] = Max( projBnds[1][0], px );
   2827 		projBnds[1][1] = Max( projBnds[1][1], py );
   2828 		projBnds[1][2] = Max( projBnds[1][2], pz );
   2829 	}
   2830 
   2831 	if ( windowSpace ) {
   2832 		// convert to window coords
   2833 		projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f;
   2834 		projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f;
   2835 
   2836 		projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f;
   2837 		projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f;
   2838 
   2839 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   2840 		projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f;
   2841 		projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f;
   2842 #endif
   2843 
   2844 		// clamp to [0, 1] range
   2845 		projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] );
   2846 		projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] );
   2847 
   2848 		projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] );
   2849 		projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] );
   2850 
   2851 		projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] );
   2852 		projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] );
   2853 	}
   2854 
   2855 	assert( projected[0].Compare( projBnds[0], 0.01f ) );
   2856 	assert( projected[1].Compare( projBnds[1], 0.01f ) );
   2857 }
   2858 
   2859 #else
   2860 
   2861 	const idVec3 points[8] = {
   2862 		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
   2863 		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
   2864 		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
   2865 		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
   2866 		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
   2867 		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
   2868 		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
   2869 		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
   2870 	};
   2871 
   2872 	idVec4 projectedPoints[8];
   2873 	for ( int i = 0; i < 8; i++ ) {
   2874 		const idVec3 & v = points[i];
   2875 		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
   2876 		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
   2877 		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   2878 		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   2879 	}
   2880 
   2881 	idVec4 edgeVerts[24];
   2882 	for ( int i = 0; i < 3; i++ ) {
   2883 		int offset0 = ( i & 1 ) * 4;
   2884 		int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2;
   2885 		int offset3 = ~( i >> 1 ) & 1;
   2886 		for ( int j = 0; j < 4; j++ ) {
   2887 			const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )];
   2888 			const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )];
   2889 
   2890 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   2891 			const float d0 = p0.z;
   2892 			const float d1 = p1.z;
   2893 #else
   2894 			const float d0 = p0.z + p0.w;
   2895 			const float d1 = p1.z + p1.w;
   2896 #endif
   2897 			const float delta = d0 - d1;
   2898 			const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f;
   2899 			const bool clip = ( fraction > 0.0f && fraction < 1.0f );
   2900 			const idVec4 intersection = p0 + fraction * ( p1 - p0 );
   2901 
   2902 			edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0;
   2903 			edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? intersection : p1;
   2904 		}
   2905 	}
   2906 
   2907 	for ( int i = 0; i < 3; i++ ) {
   2908 		projected[0][i] = RENDER_MATRIX_INFINITY;
   2909 		projected[1][i] = - RENDER_MATRIX_INFINITY;
   2910 	}
   2911 
   2912 	for ( int i = 0; i < 24; i++ ) {
   2913 		const idVec4 & v = edgeVerts[i];
   2914 
   2915 		if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
   2916 			continue;
   2917 		}
   2918 
   2919 		const float rw = 1.0f / v.w;
   2920 
   2921 		const float px = v.x * rw;
   2922 		const float py = v.y * rw;
   2923 		const float pz = v.z * rw;
   2924 
   2925 		projected[0][0] = Min( projected[0][0], px );
   2926 		projected[0][1] = Min( projected[0][1], py );
   2927 		projected[0][2] = Min( projected[0][2], pz );
   2928 
   2929 		projected[1][0] = Max( projected[1][0], px );
   2930 		projected[1][1] = Max( projected[1][1], py );
   2931 		projected[1][2] = Max( projected[1][2], pz );
   2932 	}
   2933 
   2934 	if ( windowSpace ) {
   2935 		// convert to window coords
   2936 		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
   2937 		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
   2938 
   2939 		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
   2940 		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
   2941 
   2942 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   2943 		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
   2944 		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
   2945 #endif
   2946 
   2947 		// clamp to [0, 1] range
   2948 		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
   2949 		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
   2950 
   2951 		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
   2952 		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
   2953 
   2954 		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
   2955 		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
   2956 	}
   2957 
   2958 #endif
   2959 }
   2960 
   2961 #if 0
   2962 
   2963 /*
   2964 ========================
   2965 LocalViewOriginFromMVP
   2966 ========================
   2967 */
   2968 static idVec3 LocalViewOriginFromMVP( const idRenderMatrix & mvp ) {
   2969 	const float nearX = mvp[3][0] + mvp[2][0];
   2970 	const float nearY = mvp[3][1] + mvp[2][1];
   2971 	const float nearZ = mvp[3][2] + mvp[2][2];
   2972 	const float s = idMath::InvSqrt( nearX * nearX + nearY * nearY + nearZ * nearZ );
   2973 
   2974 	idRenderMatrix inverseMVP;
   2975 	idRenderMatrix::Inverse( mvp, inverseMVP );
   2976 	const float invW = 1.0f / inverseMVP[3][3];
   2977 	const float x = ( inverseMVP[0][3] - nearX * s ) * invW;
   2978 	const float y = ( inverseMVP[1][3] - nearY * s ) * invW;
   2979 	const float z = ( inverseMVP[2][3] - nearZ * s ) * invW;
   2980 
   2981 	return idVec3( x, y, z );
   2982 }
   2983 
   2984 #endif
   2985 
   2986 /*
   2987 ========================
   2988 LocalNearClipCenterFromMVP
   2989 
   2990 Based on whether the depth range is [0,1] or [-1,1], either transform (0,0,0) or (0,0,-1) with the inverse MVP.
   2991 ========================
   2992 */
   2993 static idVec3 LocalNearClipCenterFromMVP( const idRenderMatrix & mvp ) {
   2994 	idRenderMatrix inverseMVP;
   2995 	idRenderMatrix::Inverse( mvp, inverseMVP );
   2996 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   2997 	const float x = inverseMVP[0][3];
   2998 	const float y = inverseMVP[1][3];
   2999 	const float z = inverseMVP[2][3];
   3000 	const float w = inverseMVP[3][3];
   3001 #else
   3002 	const float x = inverseMVP[0][3] - inverseMVP[0][2];
   3003 	const float y = inverseMVP[1][3] - inverseMVP[1][2];
   3004 	const float z = inverseMVP[2][3] - inverseMVP[2][2];
   3005 	const float w = inverseMVP[3][3] - inverseMVP[3][2];
   3006 #endif
   3007 	const float invW = 1.0f / w;
   3008 	return idVec3( x * invW, y * invW, z * invW );
   3009 }
   3010 
   3011 #ifdef ID_WIN_X86_SSE2_INTRIN
   3012 
   3013 /*
   3014 ========================
   3015 ClipHomogeneousPolygonToSide
   3016 
   3017 Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
   3018 ========================
   3019 */
   3020 static void ClipHomogeneousPolygonToSide_SSE2( idVec4 * __restrict newPoints, idVec4 * __restrict points, int & numPoints,
   3021 												const int axis, const __m128 & sign, const __m128 & offset ) {
   3022 	assert( newPoints != points );
   3023 
   3024 	const __m128 side = _mm_mul_ps( sign, offset );
   3025 	__m128i mask = _mm_sub_epi32( vector_int_0123, _mm_shuffle_epi32( _mm_cvtsi32_si128( numPoints ), 0 ) );
   3026 	__m128i index = _mm_setzero_si128();
   3027 
   3028 	ALIGNTYPE16 unsigned short indices[16 * 2];
   3029 	ALIGNTYPE16 float clipFractions[16];
   3030 
   3031 	int localNumPoint = numPoints;
   3032 
   3033 	for ( int i = 0; i < localNumPoint; i += 4 ) {
   3034 		const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 );
   3035 		const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 );
   3036 		const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 );
   3037 		const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 );
   3038 		const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 );
   3039 
   3040 		const __m128 p0A = _mm_load_ss( &points[i0][axis] );
   3041 		const __m128 p1A = _mm_load_ss( &points[i1][axis] );
   3042 		const __m128 p2A = _mm_load_ss( &points[i2][axis] );
   3043 		const __m128 p3A = _mm_load_ss( &points[i3][axis] );
   3044 		const __m128 p4A = _mm_load_ss( &points[i4][axis] );
   3045 
   3046 		const __m128 p0W = _mm_load_ss( &points[i0][3] );
   3047 		const __m128 p1W = _mm_load_ss( &points[i1][3] );
   3048 		const __m128 p2W = _mm_load_ss( &points[i2][3] );
   3049 		const __m128 p3W = _mm_load_ss( &points[i3][3] );
   3050 		const __m128 p4W = _mm_load_ss( &points[i4][3] );
   3051 
   3052 		const __m128 t0 = _mm_unpacklo_ps( p0A, p2A );
   3053 		const __m128 t1 = _mm_unpacklo_ps( p1A, p3A );
   3054 		const __m128 pa0 = _mm_unpacklo_ps( t0, t1 );
   3055 		const __m128 pa1 = _mm_sld_ps( pa0, p4A, 4 );
   3056 
   3057 		const __m128 r0 = _mm_unpacklo_ps( p0W, p2W );
   3058 		const __m128 r1 = _mm_unpacklo_ps( p1W, p3W );
   3059 		const __m128 pw0 = _mm_unpacklo_ps( r0, r1 );
   3060 		const __m128 pw1 = _mm_sld_ps( pw0, p4W, 4 );
   3061 
   3062 		{
   3063 			const __m128 bside0 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw0 ), _mm_mul_ps( sign, pa0 ) );
   3064 			const __m128 bside1 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw1 ), _mm_mul_ps( sign, pa1 ) );
   3065 			const __m128i side0 = _mm_and_si128( __m128c( bside0 ), vector_int_1 );
   3066 			const __m128i side1 = _mm_and_si128( __m128c( bside1 ), vector_int_1 );
   3067 			const __m128i xorSide = _mm_xor_si128( side0, side1 );
   3068 			const __m128i interleavedSide0 = _mm_unpacklo_epi32( side0, xorSide );
   3069 			const __m128i interleavedSide1 = _mm_unpackhi_epi32( side0, xorSide );
   3070 			const __m128i packedSide = _mm_packs_epi32( interleavedSide0, interleavedSide1 );
   3071 			const __m128i packedMaskedSide = _mm_and_si128( packedSide, _mm_srai_epi32( mask, 31 ) );
   3072 
   3073 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide,  2 ) );
   3074 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide,  4 ) );
   3075 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide,  6 ) );
   3076 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide,  8 ) );
   3077 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 10 ) );
   3078 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 12 ) );
   3079 			index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 14 ) );
   3080 
   3081 			_mm_store_si128( (__m128i *)&indices[i * 2], index );
   3082 
   3083 			mask = _mm_add_epi32( mask, vector_int_4 );
   3084 			index = _mm_add_epi16( index, packedMaskedSide );
   3085 			index = _mm_shufflehi_epi16( index, _MM_SHUFFLE( 3, 3, 3, 3 ) );
   3086 			index = _mm_shuffle_epi32( index, _MM_SHUFFLE( 3, 3, 3, 3 ) );
   3087 		}
   3088 
   3089 		{
   3090 			const __m128 d0 = _mm_nmsub_ps( pw0, side, pa0 );
   3091 			const __m128 d1 = _mm_nmsub_ps( pw1, side, pa1 );
   3092 			const __m128 delta = _mm_sub_ps( d0, d1 );
   3093 			const __m128 deltaAbs = _mm_and_ps( delta, vector_float_abs_mask );
   3094 			const __m128 clamp = _mm_cmpgt_ps( vector_float_smallest_non_denorm, deltaAbs );
   3095 			const __m128 deltaClamped = _mm_sel_ps( delta, vector_float_one, clamp );
   3096 			const __m128 fraction = _mm_mul_ps( d0, _mm_rcp32_ps( deltaClamped ) );
   3097 			const __m128 fractionClamped0 = _mm_sel_ps( fraction, vector_float_one, clamp );
   3098 			const __m128 fractionClamped1 = _mm_max_ps( fractionClamped0, vector_float_zero );
   3099 			const __m128 fractionClamped2 = _mm_min_ps( fractionClamped1, vector_float_one );
   3100 
   3101 			_mm_store_ps( &clipFractions[i], fractionClamped2 );
   3102 		}
   3103 	}
   3104 
   3105 	numPoints = _mm_cvtsi128_si32( index ) & 0xFFFF;
   3106 
   3107 	for ( int i = 0; i < localNumPoint; i += 4 ) {
   3108 		const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 );
   3109 		const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 );
   3110 		const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 );
   3111 		const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 );
   3112 		const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 );
   3113 
   3114 		const __m128 p0 = _mm_load_ps( points[i0].ToFloatPtr() );
   3115 		const __m128 p1 = _mm_load_ps( points[i1].ToFloatPtr() );
   3116 		const __m128 p2 = _mm_load_ps( points[i2].ToFloatPtr() );
   3117 		const __m128 p3 = _mm_load_ps( points[i3].ToFloatPtr() );
   3118 		const __m128 p4 = _mm_load_ps( points[i4].ToFloatPtr() );
   3119 
   3120 		const __m128 fraction = _mm_load_ps( &clipFractions[i] );
   3121 
   3122 		const __m128 c0 = _mm_madd_ps( _mm_splat_ps( fraction, 0 ), _mm_sub_ps( p1, p0 ), p0 );
   3123 		const __m128 c1 = _mm_madd_ps( _mm_splat_ps( fraction, 1 ), _mm_sub_ps( p2, p1 ), p1 );
   3124 		const __m128 c2 = _mm_madd_ps( _mm_splat_ps( fraction, 2 ), _mm_sub_ps( p3, p2 ), p2 );
   3125 		const __m128 c3 = _mm_madd_ps( _mm_splat_ps( fraction, 3 ), _mm_sub_ps( p4, p3 ), p3 );
   3126 
   3127 		_mm_store_ps( newPoints[indices[i * 2 + 0]].ToFloatPtr(), p0 );
   3128 		_mm_store_ps( newPoints[indices[i * 2 + 1]].ToFloatPtr(), c0 );
   3129 		_mm_store_ps( newPoints[indices[i * 2 + 2]].ToFloatPtr(), p1 );
   3130 		_mm_store_ps( newPoints[indices[i * 2 + 3]].ToFloatPtr(), c1 );
   3131 		_mm_store_ps( newPoints[indices[i * 2 + 4]].ToFloatPtr(), p2 );
   3132 		_mm_store_ps( newPoints[indices[i * 2 + 5]].ToFloatPtr(), c2 );
   3133 		_mm_store_ps( newPoints[indices[i * 2 + 6]].ToFloatPtr(), p3 );
   3134 		_mm_store_ps( newPoints[indices[i * 2 + 7]].ToFloatPtr(), c3 );
   3135 	}
   3136 }
   3137 
   3138 /*
   3139 ========================
   3140 ClipHomogeneousPolygonToUnitCube
   3141 
   3142 Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
   3143 ========================
   3144 */
   3145 static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4 * points, int numPoints ) {
   3146 	assert( numPoints < 16 - 6 );
   3147 	ALIGNTYPE16 idVec4 newPoints[16 * 2];
   3148 
   3149 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   3150 	ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_zero );	// near
   3151 #else
   3152 	ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_one );	// near
   3153 #endif
   3154 	ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 2, vector_float_pos_one, vector_float_one );	// far
   3155 	ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 1, vector_float_neg_one, vector_float_one );	// bottom
   3156 	ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 1, vector_float_pos_one, vector_float_one );	// top
   3157 	ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 0, vector_float_neg_one, vector_float_one );	// left
   3158 	ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 0, vector_float_pos_one, vector_float_one );	// right
   3159 	return numPoints;
   3160 }
   3161 
   3162 #else
   3163 
   3164 /*
   3165 ========================
   3166 ClipHomogeneousLineToSide
   3167 
   3168 Clips a line with homogeneous coordinates to the axis aligned plane[axis] = side.
   3169 ========================
   3170 */
   3171 static idVec4 ClipHomogeneousLineToSide( const idVec4 & p0, const idVec4 & p1, int axis, float side ) {
   3172 	const float d0 = p0.w * side - p0[axis];
   3173 	const float d1 = p1.w * side - p1[axis];
   3174 	const float delta = d0 - d1;
   3175 	const float f = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f;
   3176 	const float c = idMath::ClampFloat( 0.0f, 1.0f, f );
   3177 	return p0 + c * ( p1 - p0 );
   3178 }
   3179 
   3180 /*
   3181 ========================
   3182 ClipHomogeneousPolygonToSide
   3183 
   3184 Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
   3185 ========================
   3186 */
   3187 static int ClipHomogeneousPolygonToSide_Generic( idVec4 * __restrict newPoints, idVec4 * __restrict points, int numPoints, int axis, float sign, float offset ) {
   3188 	assert( newPoints != points );
   3189 
   3190 	assert( numPoints < 16 );
   3191 	int sides[16];
   3192 
   3193 	const float side = sign * offset;
   3194 
   3195 	// calculate the plane side for each original point and calculate all potential new points
   3196 	for ( int i = 0; i < numPoints; i++ ) {
   3197 		int j = ( i + 1 ) & ( ( i + 1 - numPoints ) >> 31 );
   3198 		sides[i] = sign * points[i][axis] < offset * points[i].w;
   3199 		newPoints[i * 2 + 0] = points[i];
   3200 		newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side );
   3201 	};
   3202 
   3203 	// repeat the first side at the end to avoid having to wrap around
   3204 	sides[numPoints] = sides[0];
   3205 
   3206 	// compact the array of points
   3207 	int numNewPoints = 0;
   3208 	for ( int i = 0; i < numPoints; i++ ) {
   3209 		if ( sides[i + 0] != 0 ) {
   3210 			newPoints[numNewPoints++] = newPoints[i * 2 + 0];
   3211 		}
   3212 		if ( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) {
   3213 			newPoints[numNewPoints++] = newPoints[i * 2 + 1];
   3214 		}
   3215 	}
   3216 
   3217 	assert( numNewPoints <= 16 );
   3218 	return numNewPoints;
   3219 }
   3220 
   3221 /*
   3222 ========================
   3223 ClipHomogeneousPolygonToUnitCube
   3224 
   3225 Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
   3226 ========================
   3227 */
   3228 static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4 * points, int numPoints ) {
   3229 	assert( numPoints < 16 - 6 );
   3230 	ALIGNTYPE16 idVec4 newPoints[2 * 16];	// the C clip code temporarily doubles the points
   3231 
   3232 #if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
   3233 	numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 0.0f );	// near
   3234 #else
   3235 	numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 1.0f );	// near
   3236 #endif
   3237 	numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 2, +1.0f, 1.0f );	// far
   3238 	numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 1, -1.0f, 1.0f );	// bottom
   3239 	numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 1, +1.0f, 1.0f );	// top
   3240 	numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 0, -1.0f, 1.0f );	// left
   3241 	numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 0, +1.0f, 1.0f );	// right
   3242 	return numPoints;
   3243 }
   3244 
   3245 #endif
   3246 
   3247 /*
   3248 ========================
   3249 idRenderMatrix::ProjectedFullyClippedBounds
   3250 
   3251 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
   3252 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range.
   3253 
   3254 The given bounding box is first fully clipped to the MVP to get the smallest projected bounds.
   3255 
   3256 Note that this code assumes the MVP matrix has an infinite far clipping plane. When the far plane is at
   3257 infinity the bounds are never far clipped and it is sufficient to test whether or not the center of the
   3258 near clip plane is inside the bounds to calculate the correct minimum Z. If the far plane is not at
   3259 infinity then this code would also have to test for the view frustum being completely contained inside
   3260 the given bounds in which case the projected bounds should be set to fully cover the view frustum.
   3261 ========================
   3262 */
   3263 void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
   3264 #ifdef ID_WIN_X86_SSE2_INTRIN
   3265 
   3266 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
   3267 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
   3268 	const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
   3269 	const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
   3270 
   3271 	const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 );	// mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1]
   3272 	const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 );	// mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3]
   3273 	const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 );	// mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1]
   3274 	const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 );	// mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3]
   3275 
   3276 	const __m128 mvpX = _mm_unpacklo_ps( t0, t2 );		// mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0]
   3277 	const __m128 mvpY = _mm_unpackhi_ps( t0, t2 );		// mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1]
   3278 	const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 );		// mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2]
   3279 	const __m128 mvpW = _mm_unpackhi_ps( t1, t3 );		// mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3]
   3280 
   3281 	const __m128 b0 = _mm_loadu_bounds_0( bounds );
   3282 	const __m128 b1 = _mm_loadu_bounds_1( bounds );
   3283 
   3284 	const __m128 b0X = _mm_splat_ps( b0, 0 );
   3285 	const __m128 b0Y = _mm_splat_ps( b0, 1 );
   3286 	const __m128 b0Z = _mm_splat_ps( b0, 2 );
   3287 
   3288 	const __m128 b1X = _mm_splat_ps( b1, 0 );
   3289 	const __m128 b1Y = _mm_splat_ps( b1, 1 );
   3290 	const __m128 b1Z = _mm_splat_ps( b1, 2 );
   3291 
   3292 	const __m128 p0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3293 	const __m128 p1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3294 	const __m128 p2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3295 	const __m128 p3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3296 	const __m128 p4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3297 	const __m128 p5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3298 	const __m128 p6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3299 	const __m128 p7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3300 
   3301 	ALIGNTYPE16 idVec4 projectedPoints[8];
   3302 	_mm_store_ps( projectedPoints[0].ToFloatPtr(), p0 );
   3303 	_mm_store_ps( projectedPoints[1].ToFloatPtr(), p1 );
   3304 	_mm_store_ps( projectedPoints[2].ToFloatPtr(), p2 );
   3305 	_mm_store_ps( projectedPoints[3].ToFloatPtr(), p3 );
   3306 	_mm_store_ps( projectedPoints[4].ToFloatPtr(), p4 );
   3307 	_mm_store_ps( projectedPoints[5].ToFloatPtr(), p5 );
   3308 	_mm_store_ps( projectedPoints[6].ToFloatPtr(), p6 );
   3309 	_mm_store_ps( projectedPoints[7].ToFloatPtr(), p7 );
   3310 
   3311 	ALIGNTYPE16 idVec4 clippedPoints[6 * 16];
   3312 	int numClippedPoints = 0;
   3313 	for ( int i = 0; i < 6; i++ ) {
   3314 		_mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][0]].ToFloatPtr() ) );
   3315 		_mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][1]].ToFloatPtr() ) );
   3316 		_mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][2]].ToFloatPtr() ) );
   3317 		_mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][3]].ToFloatPtr() ) );
   3318 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
   3319 	}
   3320 
   3321 	// repeat the first clipped point at the end to get a multiple of 4 clipped points
   3322 	const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() );
   3323 	for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) {
   3324 		_mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 );
   3325 	}
   3326 
   3327 	// test if the center of the near clip plane is inside the given bounding box
   3328 	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
   3329 	const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter );
   3330 
   3331 	__m128 minX = vector_float_pos_infinity;
   3332 	__m128 minY = vector_float_pos_infinity;
   3333 	__m128 minZ = inside ? vector_float_neg_one : vector_float_pos_infinity;
   3334 
   3335 	__m128 maxX = vector_float_neg_infinity;
   3336 	__m128 maxY = vector_float_neg_infinity;
   3337 	__m128 maxZ = vector_float_neg_infinity;
   3338 
   3339 	for ( int i = 0; i < numClippedPoints; i += 4 ) {
   3340 		const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() );
   3341 		const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() );
   3342 		const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() );
   3343 		const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() );
   3344 
   3345 		const __m128 s0 = _mm_unpacklo_ps( cp0, cp2 );		// cp0[0], cp2[0], cp0[1], cp2[1]
   3346 		const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 );		// cp0[2], cp2[2], cp0[3], cp2[3]
   3347 		const __m128 s2 = _mm_unpacklo_ps( cp1, cp3 );		// cp1[0], cp3[0], cp1[1], cp3[1]
   3348 		const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 );		// cp1[2], cp3[2], cp1[3], cp3[3]
   3349 
   3350 		const __m128 cpX = _mm_unpacklo_ps( s0, s2 );		// cp0[0], cp1[0], cp2[0], cp3[0]
   3351 		const __m128 cpY = _mm_unpackhi_ps( s0, s2 );		// cp0[1], cp1[1], cp2[1], cp3[1]
   3352 		const __m128 cpZ = _mm_unpacklo_ps( s1, s3 );		// cp0[2], cp1[2], cp2[2], cp3[2]
   3353 		const __m128 cpW = _mm_unpackhi_ps( s1, s3 );		// cp0[3], cp1[3], cp2[3], cp3[3]
   3354 
   3355 		const __m128 rW = _mm_rcp32_ps( cpW );
   3356 		const __m128 rX = _mm_mul_ps( cpX, rW );
   3357 		const __m128 rY = _mm_mul_ps( cpY, rW );
   3358 		const __m128 rZ = _mm_mul_ps( cpZ, rW );
   3359 
   3360 		minX = _mm_min_ps( minX, rX );
   3361 		minY = _mm_min_ps( minY, rY );
   3362 		minZ = _mm_min_ps( minZ, rZ );
   3363 
   3364 		maxX = _mm_max_ps( maxX, rX );
   3365 		maxY = _mm_max_ps( maxY, rY );
   3366 		maxZ = _mm_max_ps( maxZ, rZ );
   3367 	}
   3368 
   3369 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3370 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3371 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3372 
   3373 	minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3374 	minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3375 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3376 
   3377 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3378 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3379 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3380 
   3381 	maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3382 	maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3383 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3384 
   3385 	if ( windowSpace ) {
   3386 		minX = _mm_madd_ps( minX, vector_float_half, vector_float_half );
   3387 		maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half );
   3388 
   3389 		minY = _mm_madd_ps( minY, vector_float_half, vector_float_half );
   3390 		maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half );
   3391 
   3392 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3393 		minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
   3394 		maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
   3395 #endif
   3396 
   3397 		minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero );
   3398 		maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero );
   3399 
   3400 		minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero );
   3401 		maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero );
   3402 
   3403 		minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
   3404 		maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
   3405 	}
   3406 
   3407 	_mm_store_ss( & projected[0].x, minX );
   3408 	_mm_store_ss( & projected[0].y, minY );
   3409 	_mm_store_ss( & projected[0].z, minZ );
   3410 
   3411 	_mm_store_ss( & projected[1].x, maxX );
   3412 	_mm_store_ss( & projected[1].y, maxY );
   3413 	_mm_store_ss( & projected[1].z, maxZ );
   3414 
   3415 #else
   3416 
   3417 	const idVec3 points[8] = {
   3418 		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
   3419 		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
   3420 		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
   3421 		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
   3422 		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
   3423 		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
   3424 		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
   3425 		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
   3426 	};
   3427 
   3428 	idVec4 projectedPoints[8];
   3429 	for ( int i = 0; i < 8; i++ ) {
   3430 		const idVec3 & v = points[i];
   3431 		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
   3432 		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
   3433 		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   3434 		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   3435 	}
   3436 
   3437 	idVec4 clippedPoints[6 * 16];
   3438 	int numClippedPoints = 0;
   3439 	for ( int i = 0; i < 6; i++ ) {
   3440 		clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]];
   3441 		clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]];
   3442 		clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]];
   3443 		clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]];
   3444 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
   3445 	}
   3446 
   3447 	// test if the center of the near clip plane is inside the given bounding box
   3448 	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
   3449 	const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter );
   3450 
   3451 	for ( int i = 0; i < 3; i++ ) {
   3452 		projected[0][i] = RENDER_MATRIX_INFINITY;
   3453 		projected[1][i] = - RENDER_MATRIX_INFINITY;
   3454 	}
   3455 	if ( inside ) {
   3456 		projected[0][2] = -1.0f;
   3457 	}
   3458 
   3459 	for ( int i = 0; i < numClippedPoints; i++ ) {
   3460 		const idVec4 & c = clippedPoints[i];
   3461 
   3462 		assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
   3463 
   3464 		const float rw = 1.0f / c.w;
   3465 
   3466 		const float px = c.x * rw;
   3467 		const float py = c.y * rw;
   3468 		const float pz = c.z * rw;
   3469 
   3470 		projected[0][0] = Min( projected[0][0], px );
   3471 		projected[0][1] = Min( projected[0][1], py );
   3472 		projected[0][2] = Min( projected[0][2], pz );
   3473 
   3474 		projected[1][0] = Max( projected[1][0], px );
   3475 		projected[1][1] = Max( projected[1][1], py );
   3476 		projected[1][2] = Max( projected[1][2], pz );
   3477 	}
   3478 
   3479 	if ( windowSpace ) {
   3480 		// convert to window coords
   3481 		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
   3482 		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
   3483 
   3484 		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
   3485 		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
   3486 
   3487 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3488 		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
   3489 		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
   3490 #endif
   3491 
   3492 		// clamp to [0, 1] range
   3493 		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
   3494 		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
   3495 
   3496 		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
   3497 		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
   3498 
   3499 		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
   3500 		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
   3501 	}
   3502 
   3503 #endif
   3504 }
   3505 
   3506 /*
   3507 ========================
   3508 idRenderMatrix::DepthBoundsForBounds
   3509 
   3510 Calculates the depth bounds of the given bounding box projected with the given Model View Projection (MVP) matrix.
   3511 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
   3512 
   3513 The given bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible.
   3514 ========================
   3515 */
   3516 void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
   3517 #ifdef ID_WIN_X86_SSE2_INTRIN
   3518 
   3519 	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
   3520 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
   3521 
   3522 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   3523 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   3524 
   3525 	// take the four points on the X-Y plane
   3526 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   3527 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   3528 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   3529 
   3530 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   3531 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   3532 
   3533 	// compute four partial Z,W values
   3534 	__m128 parz = _mm_splat_ps( mvp2, 3 );
   3535 	__m128 parw = _mm_splat_ps( mvp3, 3 );
   3536 
   3537 	parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   3538 	parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   3539 
   3540 	parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   3541 	parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   3542 
   3543 	__m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz );
   3544 	__m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw );
   3545 
   3546 	__m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz );
   3547 	__m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw );
   3548 
   3549 	__m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
   3550 	w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
   3551 
   3552 	__m128 rw0 = _mm_rcp32_ps( w0 );
   3553 	z0 = _mm_mul_ps( z0, rw0 );
   3554 	z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
   3555 
   3556 	__m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
   3557 	w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
   3558 
   3559 	__m128 rw1 = _mm_rcp32_ps( w1 );
   3560 	z1 = _mm_mul_ps( z1, rw1 );
   3561 	z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
   3562 
   3563 	__m128 minv = _mm_min_ps( z0, z1 );
   3564 	__m128 maxv = _mm_max_ps( z0, z1 );
   3565 
   3566 	minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3567 	minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3568 
   3569 	maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3570 	maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3571 
   3572 	if ( windowSpace ) {
   3573 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3574 		minv = _mm_madd_ps( minv, vector_float_half, vector_float_half );
   3575 		maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half );
   3576 #endif
   3577 		minv = _mm_max_ps( minv, vector_float_zero );
   3578 		maxv = _mm_min_ps( maxv, vector_float_one );
   3579 	}
   3580 
   3581 	_mm_store_ss( & min, minv );
   3582 	_mm_store_ss( & max, maxv );
   3583 
   3584 #else
   3585 
   3586 	float localMin = RENDER_MATRIX_INFINITY;
   3587 	float localMax = - RENDER_MATRIX_INFINITY;
   3588 
   3589 	idVec3 v;
   3590 	for ( int x = 0; x < 2; x++ ) {
   3591 		v[0] = bounds[x][0];
   3592 		for ( int y = 0; y < 2; y++ ) {
   3593 			v[1] = bounds[y][1];
   3594 			for ( int z = 0; z < 2; z++ ) {
   3595 				v[2] = bounds[z][2];
   3596 
   3597 				float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   3598 				float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   3599 
   3600 				if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
   3601 					tz = tz / tw;
   3602 				} else {
   3603 					tz = -RENDER_MATRIX_INFINITY;
   3604 				}
   3605 
   3606 				localMin = Min( localMin, tz );
   3607 				localMax = Max( localMax, tz );
   3608 			}
   3609 		}
   3610 	}
   3611 
   3612 	if ( windowSpace ) {
   3613 		// convert to window coords
   3614 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3615  		min = localMin * 0.5f + 0.5f;
   3616 		max = localMax * 0.5f + 0.5f;
   3617 #endif
   3618 		// clamp to the [0, 1] range
   3619 		min = Max( min, 0.0f );
   3620 		max = Min( max, 1.0f );
   3621 	}
   3622 
   3623 #endif
   3624 }
   3625 
   3626 /*
   3627 ========================
   3628 idRenderMatrix::DepthBoundsForExtrudedBounds
   3629 
   3630 Calculates the depth bounds of the given extruded bounding box projected with the given Model View Projection (MVP) matrix.
   3631 The given bounding box is extruded in the 'extrudeDirection' up to the 'clipPlane'.
   3632 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
   3633 
   3634 The extruded bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible.
   3635 ========================
   3636 */
   3637 void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, bool windowSpace ) {
   3638 	assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );
   3639 
   3640 #ifdef ID_WIN_X86_SSE2_INTRIN
   3641 
   3642  	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
   3643 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
   3644 
   3645 	__m128 b0 = _mm_loadu_bounds_0( bounds );
   3646 	__m128 b1 = _mm_loadu_bounds_1( bounds );
   3647 
   3648 	// take the four points on the X-Y plane
   3649 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   3650 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   3651 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   3652 
   3653 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   3654 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   3655 
   3656 	__m128 minv;
   3657 	__m128 maxv;
   3658 
   3659 	// calculate the min/max depth values for the bounding box corners
   3660 	{
   3661 		// compute four partial Z,W values
   3662 		__m128 parz = _mm_splat_ps( mvp2, 3 );
   3663 		__m128 parw = _mm_splat_ps( mvp3, 3 );
   3664 
   3665 		parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   3666 		parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   3667 
   3668 		parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   3669 		parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   3670 
   3671 		__m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz );
   3672 		__m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw );
   3673 
   3674 		__m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz );
   3675 		__m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw );
   3676 
   3677 		__m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
   3678 		w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
   3679 
   3680 		__m128 rw0 = _mm_rcp32_ps( w0 );
   3681 		z0 = _mm_mul_ps( z0, rw0 );
   3682 		z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
   3683 
   3684 		__m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
   3685 		w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
   3686 
   3687 		__m128 rw1 = _mm_rcp32_ps( w1 );
   3688 		z1 = _mm_mul_ps( z1, rw1 );
   3689 		z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
   3690 
   3691 		minv = _mm_min_ps( z0, z1 );
   3692 		maxv = _mm_max_ps( z0, z1 );
   3693 	}
   3694 
   3695 	// calculate and include the min/max depth value for the extruded bounding box corners
   3696 	{
   3697 		__m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 );
   3698 		__m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 );
   3699 		__m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 );
   3700 		__m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 );
   3701 
   3702 		__m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 );
   3703 		__m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 );
   3704 		__m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 );
   3705 
   3706 		__m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) );
   3707 		__m128 invClosing = _mm_rcp32_ps( closing );
   3708 		invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit );
   3709 
   3710 		__m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) );
   3711 		__m128 d0 = _mm_madd_ps( clipZ, vz0, dt );
   3712 		__m128 d1 = _mm_madd_ps( clipZ, vz1, dt );
   3713 
   3714 		d0 = _mm_mul_ps( d0, invClosing );
   3715 		d1 = _mm_mul_ps( d1, invClosing );
   3716 
   3717 		__m128 vx0 = _mm_madd_ps( extrudeX, d0, vx );
   3718 		__m128 vx1 = _mm_madd_ps( extrudeX, d1, vx );
   3719 
   3720 		__m128 vy0 = _mm_madd_ps( extrudeY, d0, vy );
   3721 		__m128 vy1 = _mm_madd_ps( extrudeY, d1, vy );
   3722 
   3723 		vz0 = _mm_madd_ps( extrudeZ, d0, vz0 );
   3724 		vz1 = _mm_madd_ps( extrudeZ, d1, vz1 );
   3725 
   3726 		__m128 mvp2X = _mm_splat_ps( mvp2, 0 );
   3727 		__m128 mvp3X = _mm_splat_ps( mvp3, 0 );
   3728 
   3729 		__m128 mvp2W = _mm_splat_ps( mvp2, 3 );
   3730 		__m128 mvp3W = _mm_splat_ps( mvp3, 3 );
   3731 
   3732 		__m128 z0 = _mm_madd_ps( vx0, mvp2X, mvp2W );
   3733 		__m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W );
   3734 
   3735 		__m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W );
   3736 		__m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W );
   3737 
   3738 		__m128 mvp2Y = _mm_splat_ps( mvp2, 1 );
   3739 		__m128 mvp3Y = _mm_splat_ps( mvp3, 1 );
   3740 
   3741 		z0 = _mm_madd_ps( vy0, mvp2Y, z0 );
   3742 		w0 = _mm_madd_ps( vy0, mvp3Y, w0 );
   3743 
   3744 		z1 = _mm_madd_ps( vy1, mvp2Y, z1 );
   3745 		w1 = _mm_madd_ps( vy1, mvp3Y, w1 );
   3746 
   3747 		__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   3748 		__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   3749 
   3750 		z0 = _mm_madd_ps( vz0, mvp2Z, z0 );
   3751 		w0 = _mm_madd_ps( vz0, mvp3Z, w0 );
   3752 
   3753 		z1 = _mm_madd_ps( vz1, mvp2Z, z1 );
   3754 		w1 = _mm_madd_ps( vz1, mvp3Z, w1 );
   3755 
   3756 		__m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
   3757 		w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) );
   3758 
   3759 		__m128 rw0 = _mm_rcp32_ps( w0 );
   3760 		z0 = _mm_mul_ps( z0, rw0 );
   3761 		z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 );
   3762 
   3763 		__m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
   3764 		w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) );
   3765 
   3766 		__m128 rw1 = _mm_rcp32_ps( w1 );
   3767 		z1 = _mm_mul_ps( z1, rw1 );
   3768 		z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 );
   3769 
   3770 		minv = _mm_min_ps( minv, z0 );
   3771 		maxv = _mm_max_ps( maxv, z0 );
   3772 
   3773 		minv = _mm_min_ps( minv, z1 );
   3774 		maxv = _mm_max_ps( maxv, z1 );
   3775 	}
   3776 
   3777 	minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3778 	minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3779 
   3780 	maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   3781 	maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   3782 
   3783 	if ( windowSpace ) {
   3784 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3785 		minv = _mm_madd_ps( minv, vector_float_half, vector_float_half );
   3786 		maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half );
   3787 #endif
   3788 		minv = _mm_max_ps( minv, vector_float_zero );
   3789 		maxv = _mm_min_ps( maxv, vector_float_one );
   3790 	}
   3791 
   3792 	_mm_store_ss( & min, minv );
   3793 	_mm_store_ss( & max, maxv );
   3794 
   3795 #else
   3796 
   3797 	const float closing = extrudeDirection * clipPlane.Normal();
   3798 	const float invClosing = -1.0f / closing;
   3799 
   3800 	float localMin = RENDER_MATRIX_INFINITY;
   3801 	float localMax = - RENDER_MATRIX_INFINITY;
   3802 
   3803 	idVec3 v;
   3804 	for ( int x = 0; x < 2; x++ ) {
   3805 		v[0] = bounds[x][0];
   3806 		for ( int y = 0; y < 2; y++ ) {
   3807 			v[1] = bounds[y][1];
   3808 			for ( int z = 0; z < 2; z++ ) {
   3809 				v[2] = bounds[z][2];
   3810 
   3811 				for ( int extrude = 0; extrude <= 1; extrude++ ) {
   3812 
   3813 					idVec3 test;
   3814 					if ( extrude ) {
   3815 						float extrudeDist = clipPlane.Distance( v ) * invClosing;
   3816 						test = v + extrudeDirection * extrudeDist;
   3817 					} else {
   3818 						test = v;
   3819 					}
   3820 
   3821 					float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3];
   3822 					float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * mvp[3][2] + mvp[3][3];
   3823 
   3824 					if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
   3825 						tz = tz / tw;
   3826 					} else {
   3827 						tz = -RENDER_MATRIX_INFINITY;
   3828 					}
   3829 
   3830 					localMin = Min( localMin, tz );
   3831 					localMax = Max( localMax, tz );
   3832 				}
   3833 			}
   3834 		}
   3835 	}
   3836 
   3837 	if ( windowSpace ) {
   3838 		// convert to window coords
   3839 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   3840 		min = localMin * 0.5f + 0.5f;
   3841 		max = localMax * 0.5f + 0.5f;
   3842 #endif
   3843 		// clamp to the [0, 1] range
   3844 		min = Max( min, 0.0f );
   3845 		max = Min( max, 1.0f );
   3846 	}
   3847 
   3848 #endif
   3849 }
   3850 
   3851 /*
   3852 ========================
   3853 PointInsideInfiniteShadow
   3854 
   3855 Returns true if the 'localPoint' is inside the infinite shadow volume cast
   3856 from the given occluder bounding box and the given light position.
   3857 ========================
   3858 */
   3859 static bool PointInsideInfiniteShadow( const idBounds & occluderBounds, const idVec3 & localLightOrigin, const idVec3 & localPoint, const float epsilon ) {
   3860 	// Expand the bounds with an epsilon.
   3861 	const idBounds expandedBounds = occluderBounds.Expand( epsilon );
   3862 
   3863 	// If the point is inside the bounding box then the point
   3864 	// is also inside the shadow projection.
   3865 	if ( expandedBounds.ContainsPoint( localPoint ) ) {
   3866 		return true;
   3867 	}
   3868 
   3869 	// If the light is inside the bounding box then the shadow is projected
   3870 	// in all directions and any point is inside the infinte shadow projection.
   3871 	if ( expandedBounds.ContainsPoint( localPoint ) ) {
   3872 		return true;
   3873 	}
   3874 
   3875 	// If the line from localLightOrigin to localPoint intersects the
   3876 	// bounding box then the point is inside the infinite shadow projection.
   3877 	if ( expandedBounds.LineIntersection( localLightOrigin, localPoint ) ) {
   3878 		return true;
   3879 	}
   3880 
   3881 	// The point is definitely not inside the projected shadow.
   3882 	return false;
   3883 }
   3884 
   3885 /*
   3886 ========================
   3887 idRenderMatrix::DepthBoundsForShadowBounds
   3888 
   3889 Calculates the depth bounds of the infinite shadow volume projected with the given Model View Projection (MVP) matrix.
   3890 The infinite shadow volume is cast from the given occluder bounding box and the given light position.
   3891 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range.
   3892 
   3893 The infinite shadow volume is fully clipped to the MVP to get the tightest possible bounds.
   3894 
   3895 Note that this code assumes the MVP matrix has an infinite far clipping plane. When the far plane is at
   3896 infinity the shadow volume is never far clipped and it is sufficient to test whether or not the center
   3897 of the near clip plane is inside the shadow volume to calculate the correct minimum Z. If the far plane
   3898 is not at infinity then this code would also have to test for the view frustum being completely contained
   3899 inside the shadow volume to also calculate the correct maximum Z. This could be done, for instance, by
   3900 testing if the center of the far clipping plane is contained inside the shadow volume.
   3901 ========================
   3902 */
   3903 void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & localLightOrigin, bool windowSpace ) {
   3904 #ifdef ID_WIN_X86_SSE2_INTRIN
   3905 
   3906 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
   3907 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
   3908 	const __m128 mvp2 = _mm_loadu_ps( mvp[2] );
   3909 	const __m128 mvp3 = _mm_loadu_ps( mvp[3] );
   3910 
   3911 	const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 );	// mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1]
   3912 	const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 );	// mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3]
   3913 	const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 );	// mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1]
   3914 	const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 );	// mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3]
   3915 
   3916 	const __m128 mvpX = _mm_unpacklo_ps( t0, t2 );		// mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0]
   3917 	const __m128 mvpY = _mm_unpackhi_ps( t0, t2 );		// mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1]
   3918 	const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 );		// mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2]
   3919 	const __m128 mvpW = _mm_unpackhi_ps( t1, t3 );		// mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3]
   3920 
   3921 	const __m128 b0 = _mm_loadu_bounds_0( bounds );
   3922 	const __m128 b1 = _mm_loadu_bounds_1( bounds );
   3923 
   3924 	const __m128 lightOriginX = _mm_load_ss( localLightOrigin.ToFloatPtr() + 0 );
   3925 	const __m128 lightOriginY = _mm_load_ss( localLightOrigin.ToFloatPtr() + 1 );
   3926 	const __m128 lightOriginZ = _mm_load_ss( localLightOrigin.ToFloatPtr() + 2 );
   3927 	const __m128 lightOrigin = _mm_unpacklo_ps( _mm_unpacklo_ps( lightOriginX, lightOriginZ ), lightOriginY );
   3928 
   3929 	// calculate the front facing polygon bits
   3930 	int frontBits = GetBoxFrontBits_SSE2( b0, b1, lightOrigin );
   3931 
   3932 	const __m128 b0X = _mm_splat_ps( b0, 0 );
   3933 	const __m128 b0Y = _mm_splat_ps( b0, 1 );
   3934 	const __m128 b0Z = _mm_splat_ps( b0, 2 );
   3935 
   3936 	const __m128 b1X = _mm_splat_ps( b1, 0 );
   3937 	const __m128 b1Y = _mm_splat_ps( b1, 1 );
   3938 	const __m128 b1Z = _mm_splat_ps( b1, 2 );
   3939 
   3940 	// bounding box corners
   3941 	const __m128 np0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3942 	const __m128 np1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3943 	const __m128 np2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3944 	const __m128 np3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) );
   3945 	const __m128 np4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3946 	const __m128 np5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3947 	const __m128 np6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3948 	const __m128 np7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) );
   3949 
   3950 	ALIGNTYPE16 idVec4 projectedNearPoints[8];
   3951 	_mm_store_ps( projectedNearPoints[0].ToFloatPtr(), np0 );
   3952 	_mm_store_ps( projectedNearPoints[1].ToFloatPtr(), np1 );
   3953 	_mm_store_ps( projectedNearPoints[2].ToFloatPtr(), np2 );
   3954 	_mm_store_ps( projectedNearPoints[3].ToFloatPtr(), np3 );
   3955 	_mm_store_ps( projectedNearPoints[4].ToFloatPtr(), np4 );
   3956 	_mm_store_ps( projectedNearPoints[5].ToFloatPtr(), np5 );
   3957 	_mm_store_ps( projectedNearPoints[6].ToFloatPtr(), np6 );
   3958 	_mm_store_ps( projectedNearPoints[7].ToFloatPtr(), np7 );
   3959 
   3960 	// subtract the light position from the bounding box
   3961 	const __m128 lightX = _mm_splat_ps( lightOriginX, 0 );
   3962 	const __m128 lightY = _mm_splat_ps( lightOriginY, 0 );
   3963 	const __m128 lightZ = _mm_splat_ps( lightOriginZ, 0 );
   3964 
   3965 	const __m128 d0X = _mm_sub_ps( b0X, lightX );
   3966 	const __m128 d0Y = _mm_sub_ps( b0Y, lightY );
   3967 	const __m128 d0Z = _mm_sub_ps( b0Z, lightZ );
   3968 
   3969 	const __m128 d1X = _mm_sub_ps( b1X, lightX );
   3970 	const __m128 d1Y = _mm_sub_ps( b1Y, lightY );
   3971 	const __m128 d1Z = _mm_sub_ps( b1Z, lightZ );
   3972 
   3973 	// bounding box corners projected to infinity from the light position
   3974 	const __m128 fp0 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
   3975 	const __m128 fp1 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
   3976 	const __m128 fp2 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
   3977 	const __m128 fp3 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) );
   3978 	const __m128 fp4 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
   3979 	const __m128 fp5 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
   3980 	const __m128 fp6 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
   3981 	const __m128 fp7 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) );
   3982 
   3983 	ALIGNTYPE16 idVec4 projectedFarPoints[8];
   3984 	_mm_store_ps( projectedFarPoints[0].ToFloatPtr(), fp0 );
   3985 	_mm_store_ps( projectedFarPoints[1].ToFloatPtr(), fp1 );
   3986 	_mm_store_ps( projectedFarPoints[2].ToFloatPtr(), fp2 );
   3987 	_mm_store_ps( projectedFarPoints[3].ToFloatPtr(), fp3 );
   3988 	_mm_store_ps( projectedFarPoints[4].ToFloatPtr(), fp4 );
   3989 	_mm_store_ps( projectedFarPoints[5].ToFloatPtr(), fp5 );
   3990 	_mm_store_ps( projectedFarPoints[6].ToFloatPtr(), fp6 );
   3991 	_mm_store_ps( projectedFarPoints[7].ToFloatPtr(), fp7 );
   3992 
   3993 	ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16];
   3994 	int numClippedPoints = 0;
   3995 
   3996 	// clip the front facing bounding box polygons at the near cap
   3997 	const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits];
   3998 	for ( int i = 0; i < frontPolygons.count; i++ ) {
   3999 		const int polygon = frontPolygons.indices[i];
   4000 		_mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) );
   4001 		_mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) );
   4002 		_mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) );
   4003 		_mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) );
   4004 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
   4005 	}
   4006 
   4007 	// clip the front facing bounding box polygons projected to the far cap
   4008 	for ( int i = 0; i < frontPolygons.count; i++ ) {
   4009 		const int polygon = frontPolygons.indices[i];
   4010 		_mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) );
   4011 		_mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) );
   4012 		_mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) );
   4013 		_mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) );
   4014 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
   4015 	}
   4016 
   4017 	// clip the silhouette edge polygons that stretch to infinity
   4018 	const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits];
   4019 	for ( int i = 0; i < silhouetteEdges.count; i++ ) {
   4020 		const int edge = silhouetteEdges.indices[i];
   4021 		_mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) );
   4022 		_mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) );
   4023 		_mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) );
   4024 		_mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) );
   4025 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 );
   4026 	}
   4027 
   4028 	// repeat the first clipped point at the end to get a multiple of 4 clipped points
   4029 	const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() );
   4030 	for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) {
   4031 		_mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 );
   4032 	}
   4033 
   4034 	// test if the center of the near clip plane is inside the infinite shadow volume
   4035 	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
   4036 	const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON );
   4037 
   4038 	__m128 minZ = inside ? vector_float_neg_one : vector_float_pos_infinity;
   4039 	__m128 maxZ = vector_float_neg_infinity;
   4040 
   4041 	for ( int i = 0; i < numClippedPoints; i += 4 ) {
   4042 		const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() );
   4043 		const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() );
   4044 		const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() );
   4045 		const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() );
   4046 
   4047 		const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 );		// cp0[2], cp2[2], cp0[3], cp2[3]
   4048 		const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 );		// cp1[2], cp3[2], cp1[3], cp3[3]
   4049 
   4050 		const __m128 cpZ = _mm_unpacklo_ps( s1, s3 );		// cp0[2], cp1[2], cp2[2], cp3[2]
   4051 		const __m128 cpW = _mm_unpackhi_ps( s1, s3 );		// cp0[3], cp1[3], cp2[3], cp3[3]
   4052 
   4053 		const __m128 rW = _mm_rcp32_ps( cpW );
   4054 		const __m128 rZ = _mm_mul_ps( cpZ, rW );
   4055 
   4056 		minZ = _mm_min_ps( minZ, rZ );
   4057 		maxZ = _mm_max_ps( maxZ, rZ );
   4058 	}
   4059 
   4060 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   4061 	minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   4062 
   4063 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
   4064 	maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
   4065 
   4066 	if ( windowSpace ) {
   4067 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   4068 		minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half );
   4069 		maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half );
   4070 #endif
   4071 
   4072 		minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero );
   4073 		maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero );
   4074 	}
   4075 
   4076 	_mm_store_ss( & min, minZ );
   4077 	_mm_store_ss( & max, maxZ );
   4078 
   4079 #else
   4080 
   4081 	const idVec3 points[8] = {
   4082 		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
   4083 		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
   4084 		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
   4085 		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
   4086 		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
   4087 		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
   4088 		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
   4089 		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
   4090 	};
   4091 
   4092 	// calculate the front facing polygon bits
   4093 	int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin );
   4094 
   4095 	// bounding box corners
   4096 	ALIGNTYPE16 idVec4 projectedNearPoints[8];
   4097 	for ( int i = 0; i < 8; i++ ) {
   4098 		const idVec3 & v = points[i];
   4099 		projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
   4100 		projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
   4101 		projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
   4102 		projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
   4103 	}
   4104 
   4105 	// bounding box corners projected to infinity from the light position
   4106 	ALIGNTYPE16 idVec4 projectedFarPoints[8];
   4107 	for ( int i = 0; i < 8; i++ ) {
   4108 		const idVec3 v = points[i] - localLightOrigin;
   4109 		projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2];
   4110 		projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2];
   4111 		projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2];
   4112 		projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2];
   4113 	}
   4114 
   4115 	ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16];
   4116 	int numClippedPoints = 0;
   4117 
   4118 	// clip the front facing bounding box polygons at the near cap
   4119 	const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits];
   4120 	for ( int i = 0; i < frontPolygons.count; i++ ) {
   4121 		const int polygon = frontPolygons.indices[i];
   4122 		clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]];
   4123 		clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]];
   4124 		clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]];
   4125 		clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]];
   4126 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
   4127 	}
   4128 
   4129 	// clip the front facing bounding box polygons projected to the far cap
   4130 	for ( int i = 0; i < frontPolygons.count; i++ ) {
   4131 		const int polygon = frontPolygons.indices[i];
   4132 		clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]];
   4133 		clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]];
   4134 		clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]];
   4135 		clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxPolygonVertices[polygon][3]];
   4136 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
   4137 	}
   4138 
   4139 	// clip the silhouette edge polygons that stretch to infinity
   4140 	const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits];
   4141 	for ( int i = 0; i < silhouetteEdges.count; i++ ) {
   4142 		const int edge = silhouetteEdges.indices[i];
   4143 		clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]];
   4144 		clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]];
   4145 		clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]];
   4146 		clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]];
   4147 		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
   4148 	}
   4149 
   4150 	// test if the center of the near clip plane is inside the infinite shadow volume
   4151 	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
   4152 	const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON );
   4153 
   4154 	min = inside ? -1.0f : RENDER_MATRIX_INFINITY;
   4155 	max = - RENDER_MATRIX_INFINITY;
   4156 
   4157 	for ( int i = 0; i < numClippedPoints; i++ ) {
   4158 		const idVec4 & c = clippedPoints[i];
   4159 
   4160 		assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
   4161 
   4162 		const float rw = 1.0f / c.w;
   4163 		const float pz = c.z * rw;
   4164 
   4165 		min = Min( min, pz );
   4166 		max = Max( max, pz );
   4167 	}
   4168 
   4169 	if ( windowSpace ) {
   4170 		// convert to window coords
   4171 #if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
   4172 		min = min * 0.5f + 0.5f;
   4173 		max = max * 0.5f + 0.5f;
   4174 #endif
   4175 		// clamp to [0, 1] range
   4176 		min = idMath::ClampFloat( 0.0f, 1.0f, min );
   4177 		max = idMath::ClampFloat( 0.0f, 1.0f, max );
   4178 	}
   4179 
   4180 #endif
   4181 }
   4182 
   4183 /*
   4184 ========================
   4185 idRenderMatrix::GetFrustumPlanes
   4186 
   4187 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne'
   4188 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix.
   4189 ========================
   4190 */
   4191 void idRenderMatrix::GetFrustumPlanes( idPlane planes[6], const idRenderMatrix & frustum, bool zeroToOne, bool normalize ) {
   4192 	// FIXME:	need to know whether or not this is a D3D MVP.
   4193 	//			We cannot just assume that it's an D3D MVP matrix when
   4194 	//			zeroToOne = false and CLIP_SPACE_D3D is defined because
   4195 	//			this code may be called for non-MVP matrices.
   4196 	const bool isZeroOneZ = false;
   4197 
   4198 	if ( zeroToOne ) {
   4199 		// left: inside(p) = p * frustum[0] > 0
   4200 		planes[0][0] = frustum[0][0];
   4201 		planes[0][1] = frustum[0][1];
   4202 		planes[0][2] = frustum[0][2];
   4203 		planes[0][3] = frustum[0][3];
   4204  
   4205 		// bottom: inside(p) = p * frustum[1] > 0
   4206 		planes[2][0] = frustum[1][0];
   4207 		planes[2][1] = frustum[1][1];
   4208 		planes[2][2] = frustum[1][2];
   4209 		planes[2][3] = frustum[1][3];
   4210 
   4211 		// near: inside(p) = p * frustum[2] > 0
   4212 		planes[4][0] = frustum[2][0];
   4213 		planes[4][1] = frustum[2][1];
   4214 		planes[4][2] = frustum[2][2];
   4215 		planes[4][3] = frustum[2][3];
   4216 	} else {
   4217 		// left: inside(p) = p * frustum[0] > - ( p * frustum[3] )
   4218 		planes[0][0] = frustum[3][0] + frustum[0][0];
   4219 		planes[0][1] = frustum[3][1] + frustum[0][1];
   4220 		planes[0][2] = frustum[3][2] + frustum[0][2];
   4221 		planes[0][3] = frustum[3][3] + frustum[0][3];
   4222  
   4223 		// bottom: inside(p) = p * frustum[1] > -( p * frustum[3] )
   4224 		planes[2][0] = frustum[3][0] + frustum[1][0];
   4225 		planes[2][1] = frustum[3][1] + frustum[1][1];
   4226 		planes[2][2] = frustum[3][2] + frustum[1][2];
   4227 		planes[2][3] = frustum[3][3] + frustum[1][3];
   4228 
   4229 		// near: inside(p) = p * frustum[2] > -( p * frustum[3] )
   4230 		planes[4][0] = isZeroOneZ ? ( frustum[2][0] ) : ( frustum[3][0] + frustum[2][0] );
   4231 		planes[4][1] = isZeroOneZ ? ( frustum[2][1] ) : ( frustum[3][1] + frustum[2][1] );
   4232 		planes[4][2] = isZeroOneZ ? ( frustum[2][2] ) : ( frustum[3][2] + frustum[2][2] );
   4233 		planes[4][3] = isZeroOneZ ? ( frustum[2][3] ) : ( frustum[3][3] + frustum[2][3] );
   4234 	}
   4235 
   4236 	// right: inside(p) = p * frustum[0] < p * frustum[3]
   4237 	planes[1][0] = frustum[3][0] - frustum[0][0];
   4238 	planes[1][1] = frustum[3][1] - frustum[0][1];
   4239 	planes[1][2] = frustum[3][2] - frustum[0][2];
   4240 	planes[1][3] = frustum[3][3] - frustum[0][3];
   4241 
   4242 	// top: inside(p) = p * frustum[1] < p * frustum[3]
   4243 	planes[3][0] = frustum[3][0] - frustum[1][0];
   4244 	planes[3][1] = frustum[3][1] - frustum[1][1];
   4245 	planes[3][2] = frustum[3][2] - frustum[1][2];
   4246 	planes[3][3] = frustum[3][3] - frustum[1][3];
   4247  
   4248 	// far: inside(p) = p * frustum[2] < p * frustum[3]
   4249 	planes[5][0] = frustum[3][0] - frustum[2][0];
   4250 	planes[5][1] = frustum[3][1] - frustum[2][1];
   4251 	planes[5][2] = frustum[3][2] - frustum[2][2];
   4252 	planes[5][3] = frustum[3][3] - frustum[2][3];
   4253 
   4254 	// optionally normalize the planes
   4255 	if ( normalize ) {
   4256 		for ( int i = 0; i < 6; i++ ) {
   4257 			float s = idMath::InvSqrt( planes[i].Normal().LengthSqr() );
   4258 			planes[i][0] *= s;
   4259 			planes[i][1] *= s;
   4260 			planes[i][2] *= s;
   4261 			planes[i][3] *= s;
   4262 		}
   4263 	}
   4264 }
   4265 
   4266 /*
   4267 ========================
   4268 idRenderMatrix::GetFrustumCorners
   4269 ========================
   4270 */
   4271 void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRenderMatrix & frustumTransform, const idBounds & frustumBounds ) {
   4272 	assert_16_byte_aligned( &corners );
   4273 
   4274 #ifdef ID_WIN_X86_SSE2_INTRIN
   4275 
   4276 	__m128 mvp0 = _mm_loadu_ps( frustumTransform[0] );
   4277 	__m128 mvp1 = _mm_loadu_ps( frustumTransform[1] );
   4278 	__m128 mvp2 = _mm_loadu_ps( frustumTransform[2] );
   4279 	__m128 mvp3 = _mm_loadu_ps( frustumTransform[3] );
   4280 
   4281 	__m128 b0 = _mm_loadu_bounds_0( frustumBounds );
   4282 	__m128 b1 = _mm_loadu_bounds_1( frustumBounds );
   4283 
   4284 	// take the four points on the X-Y plane
   4285 	__m128 vxy = _mm_unpacklo_ps( b0, b1 );						// min X, max X, min Y, max Y
   4286 	__m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) );	// min X, max X, min X, max X
   4287 	__m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) );	// min Y, min Y, max Y, max Y
   4288 
   4289 	__m128 vz0 = _mm_splat_ps( b0, 2 );							// min Z, min Z, min Z, min Z
   4290 	__m128 vz1 = _mm_splat_ps( b1, 2 );							// max Z, max Z, max Z, max Z
   4291 
   4292 	// compute four partial X,Y,Z,W values
   4293 	__m128 parx = _mm_splat_ps( mvp0, 3 );
   4294 	__m128 pary = _mm_splat_ps( mvp1, 3 );
   4295 	__m128 parz = _mm_splat_ps( mvp2, 3 );
   4296 	__m128 parw = _mm_splat_ps( mvp3, 3 );
   4297 
   4298 	parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx );
   4299 	pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary );
   4300 	parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz );
   4301 	parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw );
   4302 
   4303 	parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx );
   4304 	pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary );
   4305 	parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz );
   4306 	parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw );
   4307 
   4308 	__m128 mvp0Z = _mm_splat_ps( mvp0, 2 );
   4309 	__m128 mvp1Z = _mm_splat_ps( mvp1, 2 );
   4310 	__m128 mvp2Z = _mm_splat_ps( mvp2, 2 );
   4311 	__m128 mvp3Z = _mm_splat_ps( mvp3, 2 );
   4312 
   4313 	__m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx );
   4314 	__m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary );
   4315 	__m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz );
   4316 	__m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw );
   4317 
   4318 	__m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx );
   4319 	__m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary );
   4320 	__m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz );
   4321 	__m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw );
   4322 
   4323 	__m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 );
   4324 	__m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 );
   4325 
   4326 	w0 = _mm_sel_ps( w0, vector_float_one, s0 );
   4327 	w1 = _mm_sel_ps( w1, vector_float_one, s1 );
   4328 
   4329 	__m128 rw0 = _mm_rcp32_ps( w0 );
   4330 	__m128 rw1 = _mm_rcp32_ps( w1 );
   4331 
   4332 	x0 = _mm_mul_ps( x0, rw0 );
   4333 	y0 = _mm_mul_ps( y0, rw0 );
   4334 	z0 = _mm_mul_ps( z0, rw0 );
   4335 
   4336 	x1 = _mm_mul_ps( x1, rw1 );
   4337 	y1 = _mm_mul_ps( y1, rw1 );
   4338 	z1 = _mm_mul_ps( z1, rw1 );
   4339 
   4340 	_mm_store_ps( corners.x + 0, x0 );
   4341 	_mm_store_ps( corners.x + 4, x1 );
   4342 	_mm_store_ps( corners.y + 0, y0 );
   4343 	_mm_store_ps( corners.y + 4, y1 );
   4344 	_mm_store_ps( corners.z + 0, z0 );
   4345 	_mm_store_ps( corners.z + 4, z1 );
   4346 
   4347 #else
   4348 
   4349 	idVec3 v;
   4350 	for ( int x = 0; x < 2; x++ ) {
   4351 		v[0] = frustumBounds[x][0];
   4352 		for ( int y = 0; y < 2; y++ ) {
   4353 			v[1] = frustumBounds[y][1];
   4354 			for ( int z = 0; z < 2; z++ ) {
   4355 				v[2] = frustumBounds[z][2];
   4356 
   4357 				float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3];
   4358 				float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3];
   4359 				float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3];
   4360 				float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3];
   4361 
   4362 				assert( tw > idMath::FLT_SMALLEST_NON_DENORMAL );
   4363 
   4364 				float rw = 1.0f / tw;
   4365 
   4366 				corners.x[(z<<2)|(y<<1)|(x<<0)] = tx * rw;
   4367 				corners.y[(z<<2)|(y<<1)|(x<<0)] = ty * rw;
   4368 				corners.z[(z<<2)|(y<<1)|(x<<0)] = tz * rw;
   4369 			}
   4370 		}
   4371 	}
   4372 
   4373 #endif
   4374 }
   4375 
   4376 /*
   4377 ========================
   4378 idRenderMatrix::CullFrustumCornersToPlane
   4379 ========================
   4380 */
   4381 frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t & corners, const idPlane & plane ) {
   4382 	assert_16_byte_aligned( &corners );
   4383 
   4384 #ifdef ID_WIN_X86_SSE2_INTRIN
   4385 
   4386 	__m128 vp = _mm_loadu_ps( plane.ToFloatPtr() );
   4387 
   4388 	__m128 x0 = _mm_load_ps( corners.x + 0 );
   4389 	__m128 y0 = _mm_load_ps( corners.y + 0 );
   4390 	__m128 z0 = _mm_load_ps( corners.z + 0 );
   4391 
   4392 	__m128 x1 = _mm_load_ps( corners.x + 4 );
   4393 	__m128 y1 = _mm_load_ps( corners.y + 4 );
   4394 	__m128 z1 = _mm_load_ps( corners.z + 4 );
   4395 
   4396 	__m128 p0 = _mm_splat_ps( vp, 0 );
   4397 	__m128 p1 = _mm_splat_ps( vp, 1 );
   4398 	__m128 p2 = _mm_splat_ps( vp, 2 );
   4399 	__m128 p3 = _mm_splat_ps( vp, 3 );
   4400 
   4401 	__m128 d0 = _mm_madd_ps( x0, p0, _mm_madd_ps( y0, p1, _mm_madd_ps( z0, p2, p3 ) ) );
   4402 	__m128 d1 = _mm_madd_ps( x1, p0, _mm_madd_ps( y1, p1, _mm_madd_ps( z1, p2, p3 ) ) );
   4403 
   4404 	int b0 = _mm_movemask_ps( d0 );
   4405 	int b1 = _mm_movemask_ps( d1 );
   4406 
   4407 	unsigned int front = ( (unsigned int) -( ( b0 & b1 ) ^ 15 ) ) >> 31;
   4408 	unsigned int back = ( (unsigned int) -( b0 | b1 ) ) >> 31;
   4409 
   4410 	compile_time_assert( FRUSTUM_CULL_FRONT == 1 );
   4411 	compile_time_assert( FRUSTUM_CULL_BACK == 2 );
   4412 	compile_time_assert( FRUSTUM_CULL_CROSS == 3 );
   4413 
   4414 	return (frustumCull_t) ( front | ( back << 1 ) );
   4415 
   4416 #else
   4417 
   4418 	bool front = false;
   4419 	bool back = false;
   4420 	for ( int i = 0; i < 8; i++ ) {
   4421 		const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3];
   4422 		if ( d >= 0.0f ) {
   4423 		    front = true;
   4424 		} else if ( d <= 0.0f ) {
   4425 		    back = true;
   4426 		}
   4427 		if ( back && front ) {
   4428 			return FRUSTUM_CULL_CROSS;
   4429 		}
   4430 	}
   4431 	if ( front ) {
   4432 		return FRUSTUM_CULL_FRONT;
   4433 	} else {
   4434 		return FRUSTUM_CULL_BACK;
   4435 	}
   4436 
   4437 #endif
   4438 }