RenderMatrix.cpp (191624B)
1 /* 2 =========================================================================== 3 4 Doom 3 BFG Edition GPL Source Code 5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 6 7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 8 9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>. 21 22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. 23 24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. 
===========================================================================
*/

#include "../ParallelJobList_JobHeaders.h"
#include "../math/Math.h"
#include "../math/Vector.h"
#include "../math/Matrix.h"
#include "../math/Rotation.h"
#include "../math/Plane.h"
#include "../bv/Sphere.h"
#include "../bv/Bounds.h"
#include "RenderMatrix.h"

// FIXME: it would be nice if all render matrices were 16-byte aligned
// so there is no need for unaligned loads and stores everywhere

// lint cannot parse the intrinsics, so force the generic (non-SSE) code paths when linting
#ifdef _lint
#undef ID_WIN_X86_SSE2_INTRIN
#endif

//lint -e438	// the non-SSE code isn't lint friendly, either
//lint -e550

// tuning constants used by the matrix routines in this file
#define RENDER_MATRIX_INVERSE_EPSILON		1e-16f	// JDC: changed from 1e-14f to allow full wasteland parallel light projections to invert
#define RENDER_MATRIX_INFINITY				1e30f	// NOTE: cannot initialize a vec_float4 with idMath::INFINITY on the SPU
#define RENDER_MATRIX_PROJECTION_EPSILON	0.1f

// selects the clip space convention; code in this file tests CLIP_SPACE_D3D to pick the [0, 1] variant instead
#define CLIP_SPACE_OGL		// the OpenGL clip space Z is in the range [-1, 1]

/*
================================================================================================

Constant render matrices

================================================================================================
*/

// identity matrix
ALIGNTYPE16 const idRenderMatrix renderMatrix_identity(
	1.0f, 0.0f, 0.0f, 0.0f,
	0.0f, 1.0f, 0.0f, 0.0f,
	0.0f, 0.0f, 1.0f, 0.0f,
	0.0f, 0.0f, 0.0f, 1.0f
);

// convert from our coordinate system (looking down X) to OpenGL's coordinate system (looking down -Z)
// NOTE(review): read row by row this gives x' = -y, y' = z, z' = -x; this assumes the
// constructor takes its sixteen arguments in row-major order -- confirm against RenderMatrix.h
ALIGNTYPE16 const idRenderMatrix renderMatrix_flipToOpenGL(
	 0.0f, -1.0f, 0.0f, 0.0f,
	 0.0f,  0.0f, 1.0f, 0.0f,
	-1.0f,  0.0f, 0.0f, 0.0f,
	 0.0f,  0.0f, 0.0f, 1.0f
);

// OpenGL -1 to 1.
// Scales x, y and z by 2 and biases them by -1; presumably this maps window-space
// coordinates in [0, 1] onto the [-1, 1] clip-space range.
ALIGNTYPE16 const idRenderMatrix renderMatrix_windowSpaceToClipSpace(
	2.0f, 0.0f, 0.0f, -1.0f,
	0.0f, 2.0f, 0.0f, -1.0f,
	0.0f, 0.0f, 2.0f, -1.0f,
	0.0f, 0.0f, 0.0f,  1.0f
);

/*
================================================================================================

SIMD constants

================================================================================================
*/

#ifdef ID_WIN_X86_SSE2_INTRIN

// integer constants; _mm_set_epi32 lists the highest lane first, so vector_int_0123 holds 0 in lane 0 and 3 in lane 3
static const __m128i vector_int_1 = _mm_set1_epi32( 1 );
static const __m128i vector_int_4 = _mm_set1_epi32( 4 );
static const __m128i vector_int_0123 = _mm_set_epi32( 3, 2, 1, 0 );
// single-bit integer patterns (bit 0 .. bit 5) reinterpreted as float vectors for use with bitwise float operations
static const __m128 vector_float_mask0 = __m128c( _mm_set1_epi32( 1<<0 ) );
static const __m128 vector_float_mask1 = __m128c( _mm_set1_epi32( 1<<1 ) );
static const __m128 vector_float_mask2 = __m128c( _mm_set1_epi32( 1<<2 ) );
static const __m128 vector_float_mask3 = __m128c( _mm_set1_epi32( 1<<3 ) );
static const __m128 vector_float_mask4 = __m128c( _mm_set1_epi32( 1<<4 ) );
static const __m128 vector_float_mask5 = __m128c( _mm_set1_epi32( 1<<5 ) );
// sign bit only / everything but the sign bit, in every lane
static const __m128 vector_float_sign_bit = __m128c( _mm_set1_epi32( IEEE_FLT_SIGN_MASK ) );
static const __m128 vector_float_abs_mask = __m128c( _mm_set1_epi32( ~IEEE_FLT_SIGN_MASK ) );
// all bits set in the last lane only; AND with this to zero the first three lanes
static const __m128 vector_float_keep_last = __m128c( _mm_set_epi32( -1, 0, 0, 0 ) );
static const __m128 vector_float_inverse_epsilon = { RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON, RENDER_MATRIX_INVERSE_EPSILON };
static const __m128 vector_float_smallest_non_denorm = { 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f, 1.1754944e-038f };
// large finite stand-ins for +/- infinity (see RENDER_MATRIX_INFINITY)
static const __m128 vector_float_pos_infinity = { RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY, RENDER_MATRIX_INFINITY };
static const __m128 vector_float_neg_infinity = { -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY, -RENDER_MATRIX_INFINITY };
static const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
static const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
static const __m128 vector_float_neg_half = { -0.5f, -0.5f, -0.5f, -0.5f };
// NOTE(review): vector_float_one and vector_float_pos_one hold the same value
static const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
static const __m128 vector_float_pos_one = { +1.0f, +1.0f, +1.0f, +1.0f };
static const __m128 vector_float_neg_one = { -1.0f, -1.0f, -1.0f, -1.0f };
// 1.0 in the last lane only
static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f };

#endif

/*
================================================================================================

Box definition

================================================================================================
*/

/*
                4----{E}---5
     +         /|         /|
     Z      {H} {I}    {F} |
     -       /  |       /  {J}
            7--{G}-----6   |
            |   |      |   |
          {L}   0----|-{A}-1
            |  /     {K}  /       -
            | {D}     | {B}      Y
            |/        |/        +
            3---{C}----2

               - X +
*/

// the four corner indices (numbered as in the diagram above) of each of the six box faces;
// the face order matches the front bits documented at GetBoxFrontBits
static const short boxPolygonVertices[6][4] = {
	{ 0, 3, 7, 4 },	// neg-X
	{ 0, 1, 5, 4 },	// neg-Y
	{ 0, 1, 2, 3 },	// neg-Z
	{ 1, 2, 6, 5 },	// pos-X
	{ 2, 3, 7, 6 },	// pos-Y
	{ 4, 5, 6, 7 }	// pos-Z
};

// the two corner indices of each of the twelve box edges, lettered as in the diagram above
static const short boxEdgeVertices[12][2] = {
	/* A = */ { 0, 1 }, /* B = */ { 1, 2 }, /* C = */ { 2, 3 }, /* D = */ { 3, 0 },	// bottom
	/* E = */ { 4, 5 }, /* F = */ { 5, 6 }, /* G = */ { 6, 7 }, /* H = */ { 7, 4 },	// top
	/* I = */ { 0, 4 }, /* J = */ { 1, 5 }, /* K = */ { 2, 6 }, /* L = */ { 3, 7 }	// sides
};

// the two faces (indices into boxPolygonVertices) that share each edge
// NOTE(review): unlike the two tables above this one is not const; in the part of the file
// visible here it is only referenced from the commented-out table generators
static int boxEdgePolygons[12][2] = {
	/* A = */ { 1, 2 }, /* B = */ { 3, 2 }, /* C = */ { 4, 2 }, /* D = */ { 0, 2 },	// bottom
	/* E = */ { 1, 5 }, /* F = */ { 3, 5 }, /* G = */ { 4, 5 }, /* H = */ { 0, 5 },	// top
	/* I = */ { 0, 1 }, /* J = */ { 3, 1 }, /* K = */ { 3, 4 }, /* L = */ { 0, 4 }	// sides
};

/*
#include <Windows.h>

class idCreateBoxFrontPolygonsForFrontBits {
public:
	idCreateBoxFrontPolygonsForFrontBits() {
		for ( int i = 0; i < 64; i++ ) {
			int frontPolygons[7] = { 0 };
			int numFrontPolygons = 0;
			char bits[7] = { 0 };
			for ( int j = 0; j < 6; j++ ) {
				if ( ( i & ( 1 << j ) ) != 0 ) {
					frontPolygons[numFrontPolygons++] = j;
					bits[5 - j] = '1';
				} else {
					bits[5 - j] = '0';
				}
			}
			const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
			if ( i == 0 ) {
				comment = " inside the box, every polygon is considered front facing";
				numFrontPolygons = 6;
				for ( int j = 0; j < 6; j++ ) {
					frontPolygons[j] = j;
				}
			}
			char buffer[1024];
			sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n",
						frontPolygons[0], frontPolygons[1], frontPolygons[2], frontPolygons[3],
						frontPolygons[4], frontPolygons[5], frontPolygons[6],
						numFrontPolygons, bits, i, comment );
			OutputDebugString( buffer );
		}
	}
} createBoxFrontPolygonsForFrontBits;
*/

// Front facing box polygons for every combination of the six front bits (see GetBoxFrontBits).
// Entry i lists the indices of the polygons whose bit is set in i, followed by their count.
// Entry 0 (inside the box) lists all six polygons. Entries marked "invalid" have both the
// negative and the positive face of the same axis flagged, i.e. ( i & ( i >> 3 ) & 7 ) != 0.
//
// make sure the size of this struct is a power of two for fast addressing an array of these without integer multiplication
static const struct frontPolygons_t {
	byte	indices[7];
	byte	count;
} boxFrontPolygonsForFrontBits[64] = {
	{ { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 000000 = 0 inside the box, every polygon is considered front facing
	{ { 0, 0, 0, 0, 0, 0, 0 }, 1 }, // 000001 = 1
	{ { 1, 0, 0, 0, 0, 0, 0 }, 1 }, // 000010 = 2
	{ { 0, 1, 0, 0, 0, 0, 0 }, 2 }, // 000011 = 3
	{ { 2, 0, 0, 0, 0, 0, 0 }, 1 }, // 000100 = 4
	{ { 0, 2, 0, 0, 0, 0, 0 }, 2 }, // 000101 = 5
	{ { 1, 2, 0, 0, 0, 0, 0 }, 2 }, // 000110 = 6
	{ { 0, 1, 2, 0, 0, 0, 0 }, 3 }, // 000111 = 7
	{ { 3, 0, 0, 0, 0, 0, 0 }, 1 }, // 001000 = 8
	{ { 0, 3, 0, 0, 0, 0, 0 }, 2 }, // 001001 = 9 invalid
	{ { 1, 3, 0, 0, 0, 0, 0 }, 2 }, // 001010 = 10
	{ { 0, 1, 3, 0, 0, 0, 0 }, 3 }, // 001011 = 11 invalid
	{ { 2, 3, 0, 0, 0, 0, 0 }, 2 }, // 001100 = 12
	{ { 0, 2, 3, 0, 0, 0, 0 }, 3 }, // 001101 = 13 invalid
	{ { 1, 2, 3, 0, 0, 0, 0 }, 3 }, // 001110 = 14
	{ { 0, 1, 2, 3, 0, 0, 0 }, 4 }, // 001111 = 15 invalid
	{ { 4, 0, 0, 0, 0, 0, 0 }, 1 }, // 010000 = 16
	{ { 0, 4, 0, 0, 0, 0, 0 }, 2 }, // 010001 = 17
	{ { 1, 4, 0, 0, 0, 0, 0 }, 2 }, // 010010 = 18 invalid
	{ { 0, 1, 4, 0, 0, 0, 0 }, 3 }, // 010011 = 19 invalid
	{ { 2, 4, 0, 0, 0, 0, 0 }, 2 }, // 010100 = 20
	{ { 0, 2, 4, 0, 0, 0, 0 }, 3 }, // 010101 = 21
	{ { 1, 2, 4, 0, 0, 0, 0 }, 3 }, // 010110 = 22 invalid
	{ { 0, 1, 2, 4, 0, 0, 0 }, 4 }, // 010111 = 23 invalid
	{ { 3, 4, 0, 0, 0, 0, 0 }, 2 }, // 011000 = 24
	{ { 0, 3, 4, 0, 0, 0, 0 }, 3 }, // 011001 = 25 invalid
	{ { 1, 3, 4, 0, 0, 0, 0 }, 3 }, // 011010 = 26 invalid
	{ { 0, 1, 3, 4, 0, 0, 0 }, 4 }, // 011011 = 27 invalid
	{ { 2, 3, 4, 0, 0, 0, 0 }, 3 }, // 011100 = 28
	{ { 0, 2, 3, 4, 0, 0, 0 }, 4 }, // 011101 = 29 invalid
	{ { 1, 2, 3, 4, 0, 0, 0 }, 4 }, // 011110 = 30 invalid
	{ { 0, 1, 2, 3, 4, 0, 0 }, 5 }, // 011111 = 31 invalid
	{ { 5, 0, 0, 0, 0, 0, 0 }, 1 }, // 100000 = 32
	{ { 0, 5, 0, 0, 0, 0, 0 }, 2 }, // 100001 = 33
	{ { 1, 5, 0, 0, 0, 0, 0 }, 2 }, // 100010 = 34
	{ { 0, 1, 5, 0, 0, 0, 0 }, 3 }, // 100011 = 35
	{ { 2, 5, 0, 0, 0, 0, 0 }, 2 }, // 100100 = 36 invalid
	{ { 0, 2, 5, 0, 0, 0, 0 }, 3 }, // 100101 = 37 invalid
	{ { 1, 2, 5, 0, 0, 0, 0 }, 3 }, // 100110 = 38 invalid
	{ { 0, 1, 2, 5, 0, 0, 0 }, 4 }, // 100111 = 39 invalid
	{ { 3, 5, 0, 0, 0, 0, 0 }, 2 }, // 101000 = 40
	{ { 0, 3, 5, 0, 0, 0, 0 }, 3 }, // 101001 = 41 invalid
	{ { 1, 3, 5, 0, 0, 0, 0 }, 3 }, // 101010 = 42
	{ { 0, 1, 3, 5, 0, 0, 0 }, 4 }, // 101011 = 43 invalid
	{ { 2, 3, 5, 0, 0, 0, 0 }, 3 }, // 101100 = 44 invalid
	{ { 0, 2, 3, 5, 0, 0, 0 }, 4 }, // 101101 = 45 invalid
	{ { 1, 2, 3, 5, 0, 0, 0 }, 4 }, // 101110 = 46 invalid
	{ { 0, 1, 2, 3, 5, 0, 0 }, 5 }, // 101111 = 47 invalid
	{ { 4, 5, 0, 0, 0, 0, 0 }, 2 }, // 110000 = 48
	{ { 0, 4, 5, 0, 0, 0, 0 }, 3 }, // 110001 = 49
	{ { 1, 4, 5, 0, 0, 0, 0 }, 3 }, // 110010 = 50 invalid
	{ { 0, 1, 4, 5, 0, 0, 0 }, 4 }, // 110011 = 51 invalid
	{ { 2, 4, 5, 0, 0, 0, 0 }, 3 }, // 110100 = 52 invalid
	{ { 0, 2, 4, 5, 0, 0, 0 }, 4 }, // 110101 = 53 invalid
	{ { 1, 2, 4, 5, 0, 0, 0 }, 4 }, // 110110 = 54 invalid
	{ { 0, 1, 2, 4, 5, 0, 0 }, 5 }, // 110111 = 55 invalid
	{ { 3, 4, 5, 0, 0, 0, 0 }, 3 }, // 111000 = 56
	{ { 0, 3, 4, 5, 0, 0, 0 }, 4 }, // 111001 = 57 invalid
	{ { 1, 3, 4, 5, 0, 0, 0 }, 4 }, // 111010 = 58 invalid
	{ { 0, 1, 3, 4, 5, 0, 0 }, 5 }, // 111011 = 59 invalid
	{ { 2, 3, 4, 5, 0, 0, 0 }, 4 }, // 111100 = 60 invalid
	{ { 0, 2, 3, 4, 5, 0, 0 }, 5 }, // 111101 = 61 invalid
	{ { 1, 2, 3, 4, 5, 0, 0 }, 5 }, // 111110 = 62 invalid
	{ { 0, 1, 2, 3, 4, 5, 0 }, 6 }, // 111111 = 63 invalid
};

/*
#include <Windows.h>

class idCreateBoxSilhouetteEdgesForFrontBits {
public:
	idCreateBoxSilhouetteEdgesForFrontBits() {
		for ( int i = 0; i < 64; i++ ) {
			int silhouetteEdges[12] = { 0 };
			int numSilhouetteEdges = 0;

			for ( int j = 0; j < 12; j++ ) {
				if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
					silhouetteEdges[numSilhouetteEdges++] = j;
				}
			}

			char bits[7] = { 0 };
			for ( int j = 0; j < 6; j++ ) {
				if ( ( i & ( 1 << j ) ) != 0 ) {
					bits[5 - j] = '1';
				} else {
					bits[5 - j] = '0';
				}
			}
			const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? " invalid" : "";
			if ( i == 0 ) {
				comment = " inside the box, every edge is considered part of the silhouette";
			}
			char buffer[1024];
			sprintf( buffer, "{ { %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d, %2d }, %2d }, // %s = %d%s\n",
						silhouetteEdges[0], silhouetteEdges[1], silhouetteEdges[2], silhouetteEdges[3],
						silhouetteEdges[4], silhouetteEdges[5], silhouetteEdges[6], silhouetteEdges[7],
						silhouetteEdges[8], silhouetteEdges[9], silhouetteEdges[10], silhouetteEdges[11],
						numSilhouetteEdges, bits, i, comment );
			OutputDebugString( buffer );
		}
	}
} createBoxSilhouetteEdgesForFrontBits;
*/

// Silhouette edges of the box for every combination of the six front bits (see GetBoxFrontBits).
// An edge is part of the silhouette when exactly one of its two polygons (boxEdgePolygons) is
// front facing; entry 0 (inside the box) lists all twelve edges. Edge indices refer to
// boxEdgeVertices. Entries i and 63 - i list the same edges because complementing the front
// bits does not change which edges separate a front facing from a back facing polygon.
//
// make sure the size of this struct is a power of two for fast addressing an array of these without integer multiplication
static const struct silhouetteEdges_t {
	byte	indices[12];
	int32	count;
} boxSilhouetteEdgesForFrontBits[64] = {
	{ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11 }, 12 }, // 000000 = 0 inside the box, every edge is considered part of the silhouette
	{ {  3,  7,  8, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000001 = 1
	{ {  0,  4,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000010 = 2
	{ {  0,  3,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000011 = 3
	{ {  0,  1,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 000100 = 4
	{ {  0,  1,  2,  7,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000101 = 5
	{ {  1,  2,  3,  4,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 000110 = 6
	{ {  1,  2,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 000111 = 7
	{ {  1,  5,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 001000 = 8
	{ {  1,  3,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 001001 = 9 invalid
	{ {  0,  1,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001010 = 10
	{ {  0,  1,  3,  4,  5,  7, 10, 11,  0,  0,  0,  0 },  8 }, // 001011 = 11 invalid
	{ {  0,  2,  3,  5,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001100 = 12
	{ {  0,  2,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 001101 = 13 invalid
	{ {  2,  3,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 001110 = 14
	{ {  2,  4,  5,  7, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 001111 = 15 invalid
	{ {  2,  6, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 010000 = 16
	{ {  2,  3,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010001 = 17
	{ {  0,  2,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 010010 = 18 invalid
	{ {  0,  2,  3,  4,  6,  7,  9, 10,  0,  0,  0,  0 },  8 }, // 010011 = 19 invalid
	{ {  0,  1,  3,  6, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 010100 = 20
	{ {  0,  1,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010101 = 21
	{ {  1,  3,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 010110 = 22 invalid
	{ {  1,  4,  6,  7,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 010111 = 23 invalid
	{ {  1,  2,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011000 = 24
	{ {  1,  2,  3,  5,  6,  7,  8,  9,  0,  0,  0,  0 },  8 }, // 011001 = 25 invalid
	{ {  0,  1,  2,  4,  5,  6,  8, 11,  0,  0,  0,  0 },  8 }, // 011010 = 26 invalid
	{ {  0,  1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0 },  8 }, // 011011 = 27 invalid
	{ {  0,  3,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011100 = 28
	{ {  0,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 011101 = 29 invalid
	{ {  3,  4,  5,  6,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 011110 = 30 invalid
	{ {  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 011111 = 31 invalid
	{ {  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 100000 = 32
	{ {  3,  4,  5,  6,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100001 = 33
	{ {  0,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 100010 = 34
	{ {  0,  3,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100011 = 35
	{ {  0,  1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0 },  8 }, // 100100 = 36 invalid
	{ {  0,  1,  2,  4,  5,  6,  8, 11,  0,  0,  0,  0 },  8 }, // 100101 = 37 invalid
	{ {  1,  2,  3,  5,  6,  7,  8,  9,  0,  0,  0,  0 },  8 }, // 100110 = 38 invalid
	{ {  1,  2,  5,  6,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 100111 = 39 invalid
	{ {  1,  4,  6,  7,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101000 = 40
	{ {  1,  3,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 101001 = 41 invalid
	{ {  0,  1,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101010 = 42
	{ {  0,  1,  3,  6, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 101011 = 43 invalid
	{ {  0,  2,  3,  4,  6,  7,  9, 10,  0,  0,  0,  0 },  8 }, // 101100 = 44 invalid
	{ {  0,  2,  4,  6,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 101101 = 45 invalid
	{ {  2,  3,  6,  7,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 101110 = 46 invalid
	{ {  2,  6, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 101111 = 47 invalid
	{ {  2,  4,  5,  7, 10, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 110000 = 48
	{ {  2,  3,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110001 = 49
	{ {  0,  2,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 110010 = 50 invalid
	{ {  0,  2,  3,  5,  9, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110011 = 51 invalid
	{ {  0,  1,  3,  4,  5,  7, 10, 11,  0,  0,  0,  0 },  8 }, // 110100 = 52 invalid
	{ {  0,  1,  4,  5,  8, 10,  0,  0,  0,  0,  0,  0 },  6 }, // 110101 = 53 invalid
	{ {  1,  3,  5,  7,  8,  9, 10, 11,  0,  0,  0,  0 },  8 }, // 110110 = 54 invalid
	{ {  1,  5,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 110111 = 55 invalid
	{ {  1,  2,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111000 = 56
	{ {  1,  2,  3,  4,  8,  9,  0,  0,  0,  0,  0,  0 },  6 }, // 111001 = 57 invalid
	{ {  0,  1,  2,  7,  8, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111010 = 58 invalid
	{ {  0,  1,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111011 = 59 invalid
	{ {  0,  3,  4,  7,  9, 11,  0,  0,  0,  0,  0,  0 },  6 }, // 111100 = 60 invalid
	{ {  0,  4,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111101 = 61 invalid
	{ {  3,  7,  8, 11,  0,  0,  0,  0,  0,  0,  0,  0 },  4 }, // 111110 = 62 invalid
	{ {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },  0 }, // 111111 = 63 invalid
};

/*
#include <Windows.h>

class idCreateBoxSilhouetteVerticesForFrontBits {
public:
	idCreateBoxSilhouetteVerticesForFrontBits() {
		for ( int i = 0; i < 64; i++ ) {
			int silhouetteEdges[12] = { 0 };
			int numSilhouetteEdges = 0;

			for ( int j = 0; j < 12; j++ ) {
				if ( i == 0 || ( ( i >> boxEdgePolygons[j][0] ) & 1 ) != ( ( i >> boxEdgePolygons[j][1] ) & 1 ) ) {
					silhouetteEdges[numSilhouetteEdges++] = j;
400 } 401 } 402 403 int silhouetteVertices[8] = { 0 }; 404 int numSilhouetteVertices = 0; 405 406 int vertex = boxEdgeVertices[silhouetteEdges[0]][0]; 407 for ( int j = 0; j < 7; j++ ) { 408 int newVertex = -1; 409 for ( int j = 0; j < numSilhouetteEdges; j++ ) { 410 if ( silhouetteEdges[j] == -1 ) { 411 continue; 412 } 413 if ( boxEdgeVertices[silhouetteEdges[j]][0] == vertex ) { 414 newVertex = boxEdgeVertices[silhouetteEdges[j]][1]; 415 silhouetteEdges[j] = -1; 416 break; 417 } else if ( boxEdgeVertices[silhouetteEdges[j]][1] == vertex ) { 418 newVertex = boxEdgeVertices[silhouetteEdges[j]][0]; 419 silhouetteEdges[j] = -1; 420 break; 421 } 422 } 423 if ( newVertex == -1 ) { 424 break; 425 } 426 silhouetteVertices[numSilhouetteVertices++] = newVertex; 427 vertex = newVertex; 428 } 429 430 char bits[7] = { 0 }; 431 for ( int j = 0; j < 6; j++ ) { 432 if ( ( i & ( 1 << j ) ) != 0 ) { 433 bits[5 - j] = '1'; 434 } else { 435 bits[5 - j] = '0'; 436 } 437 } 438 const char * comment = ( ( i & ( i >> 3 ) & 7 ) != 0 ) ? 
" invalid" : ""; 439 if ( i == 0 ) { 440 comment = " inside the box, no silhouette"; 441 } 442 char buffer[1024]; 443 sprintf( buffer, "{ { %d, %d, %d, %d, %d, %d, %d }, %d }, // %s = %d%s\n", 444 silhouetteVertices[0], silhouetteVertices[1], silhouetteVertices[2], silhouetteVertices[3], 445 silhouetteVertices[4], silhouetteVertices[5], silhouetteVertices[6], numSilhouetteVertices, bits, i, comment ); 446 OutputDebugString( buffer ); 447 } 448 } 449 } createBoxSilhouetteVerticesForFrontBits; 450 */ 451 452 // make sure this is a power of two for fast addressing an array of these without integer multiplication 453 static const struct silhouetteVertices_t { 454 byte indices[7]; 455 byte count; 456 } boxSilhouetteVerticesForFrontBits[64] = { 457 { { 1, 2, 3, 0, 4, 5, 6 }, 7 }, // 000000 = 0 inside the box, no vertex is considered part of the silhouette 458 { { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 000001 = 1 459 { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 000010 = 2 460 { { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 000011 = 3 461 { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 000100 = 4 462 { { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 000101 = 5 463 { { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 000110 = 6 464 { { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 000111 = 7 465 { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001000 = 8 466 { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 001001 = 9 invalid 467 { { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 001010 = 10 468 { { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 001011 = 11 invalid 469 { { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 001100 = 12 470 { { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 001101 = 13 invalid 471 { { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 001110 = 14 472 { { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 001111 = 15 invalid 473 { { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 010000 = 16 474 { { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 010001 = 17 475 { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 010010 = 18 invalid 476 { { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 010011 = 19 invalid 477 { { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 010100 = 20 478 { { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 010101 = 21 479 { { 2, 6, 7, 3, 0, 4, 5 
}, 7 }, // 010110 = 22 invalid 480 { { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 010111 = 23 invalid 481 { { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 011000 = 24 482 { { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 011001 = 25 invalid 483 { { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 011010 = 26 invalid 484 { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 011011 = 27 invalid 485 { { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 011100 = 28 486 { { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 011101 = 29 invalid 487 { { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 011110 = 30 invalid 488 { { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 011111 = 31 invalid 489 { { 5, 6, 7, 4, 0, 0, 0 }, 4 }, // 100000 = 32 490 { { 0, 4, 5, 6, 7, 3, 0 }, 6 }, // 100001 = 33 491 { { 1, 5, 6, 7, 4, 0, 0 }, 6 }, // 100010 = 34 492 { { 1, 5, 6, 7, 3, 0, 0 }, 6 }, // 100011 = 35 493 { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 100100 = 36 invalid 494 { { 1, 2, 3, 7, 6, 5, 4 }, 7 }, // 100101 = 37 invalid 495 { { 2, 3, 0, 4, 7, 6, 5 }, 7 }, // 100110 = 38 invalid 496 { { 2, 3, 7, 6, 5, 1, 0 }, 6 }, // 100111 = 39 invalid 497 { { 2, 6, 7, 4, 5, 1, 0 }, 6 }, // 101000 = 40 498 { { 2, 6, 7, 3, 0, 4, 5 }, 7 }, // 101001 = 41 invalid 499 { { 1, 2, 6, 7, 4, 0, 0 }, 6 }, // 101010 = 42 500 { { 1, 2, 6, 7, 3, 0, 0 }, 6 }, // 101011 = 43 invalid 501 { { 1, 5, 4, 7, 6, 2, 3 }, 7 }, // 101100 = 44 invalid 502 { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 101101 = 45 invalid 503 { { 3, 0, 4, 7, 6, 2, 0 }, 6 }, // 101110 = 46 invalid 504 { { 3, 7, 6, 2, 0, 0, 0 }, 4 }, // 101111 = 47 invalid 505 { { 3, 7, 4, 5, 6, 2, 0 }, 6 }, // 110000 = 48 506 { { 3, 0, 4, 5, 6, 2, 0 }, 6 }, // 110001 = 49 507 { { 1, 5, 6, 2, 3, 7, 4 }, 7 }, // 110010 = 50 invalid 508 { { 1, 5, 6, 2, 3, 0, 0 }, 6 }, // 110011 = 51 invalid 509 { { 1, 2, 6, 5, 4, 7, 3 }, 7 }, // 110100 = 52 invalid 510 { { 1, 2, 6, 5, 4, 0, 0 }, 6 }, // 110101 = 53 invalid 511 { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110110 = 54 invalid 512 { { 2, 6, 5, 1, 0, 0, 0 }, 4 }, // 110111 = 55 invalid 513 { { 2, 3, 7, 4, 5, 1, 0 }, 6 }, // 111000 = 56 514 { { 2, 3, 0, 4, 5, 1, 0 }, 6 }, // 111001 = 57 
invalid 515 { { 1, 2, 3, 7, 4, 0, 0 }, 6 }, // 111010 = 58 invalid 516 { { 1, 2, 3, 0, 0, 0, 0 }, 4 }, // 111011 = 59 invalid 517 { { 1, 5, 4, 7, 3, 0, 0 }, 6 }, // 111100 = 60 invalid 518 { { 1, 5, 4, 0, 0, 0, 0 }, 4 }, // 111101 = 61 invalid 519 { { 0, 4, 7, 3, 0, 0, 0 }, 4 }, // 111110 = 62 invalid 520 { { 0, 0, 0, 0, 0, 0, 0 }, 0 }, // 111111 = 63 invalid 521 }; 522 523 /* 524 ======================== 525 GetBoxFrontBits 526 527 front bits: 528 bit 0 = neg-X is front facing 529 bit 1 = neg-Y is front facing 530 bit 2 = neg-Z is front facing 531 bit 3 = pos-X is front facing 532 bit 4 = pos-Y is front facing 533 bit 5 = pos-Z is front facing 534 ======================== 535 */ 536 #ifdef ID_WIN_X86_SSE2_INTRIN 537 538 static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m128 & viewOrigin ) { 539 const __m128 dir0 = _mm_sub_ps( viewOrigin, b0 ); 540 const __m128 dir1 = _mm_sub_ps( b1, viewOrigin ); 541 const __m128 d0 = _mm_cmplt_ps( dir0, _mm_setzero_ps() ); 542 const __m128 d1 = _mm_cmplt_ps( dir1, _mm_setzero_ps() ); 543 544 int frontBits = _mm_movemask_ps( d0 ) | ( _mm_movemask_ps( d1 ) << 3 ); 545 return frontBits; 546 } 547 548 #else 549 550 static int GetBoxFrontBits_Generic( const idBounds & bounds, const idVec3 & viewOrigin ) { 551 idVec3 dir0 = viewOrigin - bounds[0]; 552 idVec3 dir1 = bounds[1] - viewOrigin; 553 int frontBits = 0; 554 frontBits |= IEEE_FLT_SIGNBITSET( dir0.x ) << 0; 555 frontBits |= IEEE_FLT_SIGNBITSET( dir0.y ) << 1; 556 frontBits |= IEEE_FLT_SIGNBITSET( dir0.z ) << 2; 557 frontBits |= IEEE_FLT_SIGNBITSET( dir1.x ) << 3; 558 frontBits |= IEEE_FLT_SIGNBITSET( dir1.y ) << 4; 559 frontBits |= IEEE_FLT_SIGNBITSET( dir1.z ) << 5; 560 return frontBits; 561 } 562 563 #endif 564 565 /* 566 ================================================================================================ 567 568 idRenderMatrix implementation 569 570 
================================================================================================ 571 */ 572 573 /* 574 ======================== 575 idRenderMatrix::CreateFromOriginAxis 576 ======================== 577 */ 578 void idRenderMatrix::CreateFromOriginAxis( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) { 579 out[0][0] = axis[0][0]; 580 out[0][1] = axis[1][0]; 581 out[0][2] = axis[2][0]; 582 out[0][3] = origin[0]; 583 584 out[1][0] = axis[0][1]; 585 out[1][1] = axis[1][1]; 586 out[1][2] = axis[2][1]; 587 out[1][3] = origin[1]; 588 589 out[2][0] = axis[0][2]; 590 out[2][1] = axis[1][2]; 591 out[2][2] = axis[2][2]; 592 out[2][3] = origin[2]; 593 594 out[3][0] = 0.0f; 595 out[3][1] = 0.0f; 596 out[3][2] = 0.0f; 597 out[3][3] = 1.0f; 598 } 599 600 /* 601 ======================== 602 idRenderMatrix::CreateFromOriginAxisScale 603 ======================== 604 */ 605 void idRenderMatrix::CreateFromOriginAxisScale( const idVec3 & origin, const idMat3 & axis, const idVec3 & scale, idRenderMatrix & out ) { 606 out[0][0] = axis[0][0] * scale[0]; 607 out[0][1] = axis[1][0] * scale[1]; 608 out[0][2] = axis[2][0] * scale[2]; 609 out[0][3] = origin[0]; 610 611 out[1][0] = axis[0][1] * scale[0]; 612 out[1][1] = axis[1][1] * scale[1]; 613 out[1][2] = axis[2][1] * scale[2]; 614 out[1][3] = origin[1]; 615 616 out[2][0] = axis[0][2] * scale[0]; 617 out[2][1] = axis[1][2] * scale[1]; 618 out[2][2] = axis[2][2] * scale[2]; 619 out[2][3] = origin[2]; 620 621 out[3][0] = 0.0f; 622 out[3][1] = 0.0f; 623 out[3][2] = 0.0f; 624 out[3][3] = 1.0f; 625 } 626 627 /* 628 ======================== 629 idRenderMatrix::CreateViewMatrix 630 631 Our axis looks down positive +X, render matrix looks down -Z. 
632 ======================== 633 */ 634 void idRenderMatrix::CreateViewMatrix( const idVec3 & origin, const idMat3 & axis, idRenderMatrix & out ) { 635 out[0][0] = -axis[1][0]; 636 out[0][1] = -axis[1][1]; 637 out[0][2] = -axis[1][2]; 638 out[0][3] = origin[0] * axis[1][0] + origin[1] * axis[1][1] + origin[2] * axis[1][2]; 639 640 out[1][0] = axis[2][0]; 641 out[1][1] = axis[2][1]; 642 out[1][2] = axis[2][2]; 643 out[1][3] = -( origin[0] * axis[2][0] + origin[1] * axis[2][1] + origin[2] * axis[2][2] ); 644 645 out[2][0] = -axis[0][0]; 646 out[2][1] = -axis[0][1]; 647 out[2][2] = -axis[0][2]; 648 out[2][3] = origin[0] * axis[0][0] + origin[1] * axis[0][1] + origin[2] * axis[0][2]; 649 650 out[3][0] = 0.0f; 651 out[3][1] = 0.0f; 652 out[3][2] = 0.0f; 653 out[3][3] = 1.0f; 654 } 655 656 /* 657 ======================== 658 idRenderMatrix::CreateProjectionMatrix 659 660 If zFar == 0, an infinite far plane will be used. 661 ======================== 662 */ 663 void idRenderMatrix::CreateProjectionMatrix( float xMin, float xMax, float yMin, float yMax, float zNear, float zFar, idRenderMatrix & out ) { 664 const float width = xMax - xMin; 665 const float height = yMax - yMin; 666 667 out[0][0] = 2.0f * zNear / width; 668 out[0][1] = 0.0f; 669 out[0][2] = ( xMax + xMin ) / width; // normally 0 670 out[0][3] = 0.0f; 671 672 out[1][0] = 0.0f; 673 out[1][1] = 2.0f * zNear / height; 674 out[1][2] = ( yMax + yMin ) / height; // normally 0 675 out[1][3] = 0.0f; 676 677 if ( zFar <= zNear ) { 678 // this is the far-plane-at-infinity formulation 679 out[2][0] = 0.0f; 680 out[2][1] = 0.0f; 681 out[2][2] = -1.0f; 682 #if defined( CLIP_SPACE_D3D ) 683 // the D3D clip space Z is in range [0,1] instead of [-1,1] 684 out[2][3] = -zNear; 685 #else 686 out[2][3] = -2.0f * zNear; 687 #endif 688 } else { 689 out[2][0] = 0.0f; 690 out[2][1] = 0.0f; 691 #if defined( CLIP_SPACE_D3D ) 692 // the D3D clip space Z is in range [0,1] instead of [-1,1] 693 out[2][2] = -( zFar ) / ( zFar - zNear ); 694 
out[2][3] = -( zFar * zNear ) / ( zFar - zNear ); 695 #else 696 out[2][2] = -( zFar + zNear ) / ( zFar - zNear ); 697 out[2][3] = -( 2.0f * zFar * zNear ) / ( zFar - zNear ); 698 #endif 699 } 700 701 out[3][0] = 0.0f; 702 out[3][1] = 0.0f; 703 out[3][2] = -1.0f; 704 out[3][3] = 0.0f; 705 } 706 707 /* 708 ======================== 709 idRenderMatrix::CreateProjectionMatrixFov 710 711 xOffset and yOffset should be in the -1 to 1 range for sub-pixel accumulation jitter. 712 xOffset can also be used for eye separation when rendering stereo. 713 ======================== 714 */ 715 void idRenderMatrix::CreateProjectionMatrixFov( float xFovDegrees, float yFovDegrees, float zNear, float zFar, float xOffset, float yOffset, idRenderMatrix & out ) { 716 float xMax = zNear * idMath::Tan( DEG2RAD( xFovDegrees ) * 0.5f ); 717 float xMin = -xMax; 718 719 float yMax = zNear * idMath::Tan( DEG2RAD( yFovDegrees ) * 0.5f ); 720 float yMin = -yMax; 721 722 xMin += xOffset; 723 xMax += xOffset; 724 725 yMin += yOffset; 726 yMax += yOffset; 727 728 CreateProjectionMatrix( xMin, xMax, yMin, yMax, zNear, zFar, out ); 729 } 730 731 /* 732 ======================== 733 idRenderMatrix::OffsetScaleForBounds 734 735 Add the offset to the center of the bounds and scale for the width of the bounds. 736 The result matrix will transform the unit-cube to exactly cover the bounds. 
========================
*/
void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
	// the generic path writes out[i][0..2] before it reads src[i][0..2] again, so src and out must not alias
	assert( &src != &out );

#ifdef ID_WIN_X86_SSE2_INTRIN

	__m128 b0 = _mm_loadu_bounds_0( bounds );
	__m128 b1 = _mm_loadu_bounds_1( bounds );

	// center of the bounds and half of its size along each axis
	__m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half );
	__m128 scale  = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half );

	// OR the bit pattern of 1.0f into the last lane so the last column is carried over unscaled;
	// NOTE(review): this only yields 1.0 if the bounds loads leave the last lane zero -- confirm
	scale = _mm_or_ps( scale, vector_float_last_one );

	__m128 a0 = _mm_loadu_ps( src.m + 0*4 );
	__m128 a1 = _mm_loadu_ps( src.m + 1*4 );
	__m128 a2 = _mm_loadu_ps( src.m + 2*4 );
	__m128 a3 = _mm_loadu_ps( src.m + 3*4 );

	// per-lane products of every matrix row with the offset
	__m128 d0 = _mm_mul_ps( a0, offset );
	__m128 d1 = _mm_mul_ps( a1, offset );
	__m128 d2 = _mm_mul_ps( a2, offset );
	__m128 d3 = _mm_mul_ps( a3, offset );

	// transpose the products so that lane i of t0/t1/t2 holds the x/y/z product of row i
	__m128 s0 = _mm_unpacklo_ps( d0, d2 );		// a0, c0, a1, c1
	__m128 s1 = _mm_unpackhi_ps( d0, d2 );		// a2, c2, a3, c3
	__m128 s2 = _mm_unpacklo_ps( d1, d3 );		// b0, d0, b1, d1
	__m128 s3 = _mm_unpackhi_ps( d1, d3 );		// b2, d2, b3, d3

	__m128 t0 = _mm_unpacklo_ps( s0, s2 );		// a0, b0, c0, d0
	__m128 t1 = _mm_unpackhi_ps( s0, s2 );		// a1, b1, c1, d1
	__m128 t2 = _mm_unpacklo_ps( s1, s3 );		// a2, b2, c2, d2

	// lane i of t0 becomes row[i].x * offset.x + row[i].y * offset.y + row[i].z * offset.z
	t0 = _mm_add_ps( t0, t1 );
	t0 = _mm_add_ps( t0, t2 );

	// move the sum for each row into the last lane of its own vector and zero the other lanes
	// (_mm_splat_ps is a project macro; presumably it broadcasts the given lane)
	__m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last );
	__m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last );
	__m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last );
	__m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last );

	// scale the first three columns and add the offset term to the last column
	// (_mm_madd_ps is a project macro; presumably a * b + c, which matches the generic path below)
	a0 = _mm_madd_ps( a0, scale, n0 );
	a1 = _mm_madd_ps( a1, scale, n1 );
	a2 = _mm_madd_ps( a2, scale, n2 );
	a3 = _mm_madd_ps( a3, scale, n3 );

	_mm_storeu_ps( out.m + 0*4, a0 );
	_mm_storeu_ps( out.m + 1*4, a1 );
	_mm_storeu_ps( out.m + 2*4, a2 );
	_mm_storeu_ps( out.m + 3*4, a3 );

#else

	// center of the bounds and half of its size along each axis
	const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
	const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;

	// for every row: scale the first three columns, and add the row's
	// x/y/z entries multiplied by the offset to the last column
	out[0][0] = src[0][0] * scale[0];
	out[0][1] = src[0][1] * scale[1];
	out[0][2] = src[0][2] * scale[2];
	out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2];

	out[1][0] = src[1][0] * scale[0];
	out[1][1] = src[1][1] * scale[1];
	out[1][2] = src[1][2] * scale[2];
	out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2];

	out[2][0] = src[2][0] * scale[0];
	out[2][1] = src[2][1] * scale[1];
	out[2][2] = src[2][2] * scale[2];
	out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2];

	out[3][0] = src[3][0] * scale[0];
	out[3][1] = src[3][1] * scale[1];
	out[3][2] = src[3][2] * scale[2];
	out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2];

#endif
}

/*
========================
idRenderMatrix::InverseOffsetScaleForBounds

Subtract the offset to the center of the bounds and inverse scale for the width of the bounds.
The result matrix will transform the bounds to exactly cover the unit-cube.
823 ======================== 824 */ 825 void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) { 826 assert( &src != &out ); 827 828 #ifdef ID_WIN_X86_SSE2_INTRIN 829 830 __m128 b0 = _mm_loadu_bounds_0( bounds ); 831 __m128 b1 = _mm_loadu_bounds_1( bounds ); 832 833 __m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_neg_half ); 834 __m128 scale = _mm_mul_ps( _mm_sub_ps( b0, b1 ), vector_float_neg_half ); 835 836 scale = _mm_max_ps( scale, vector_float_smallest_non_denorm ); 837 838 __m128 rscale = _mm_rcp32_ps( scale ); 839 840 offset = _mm_mul_ps( offset, rscale ); 841 842 __m128 d0 = _mm_and_ps( _mm_splat_ps( offset, 0 ), vector_float_keep_last ); 843 __m128 d1 = _mm_and_ps( _mm_splat_ps( offset, 1 ), vector_float_keep_last ); 844 __m128 d2 = _mm_and_ps( _mm_splat_ps( offset, 2 ), vector_float_keep_last ); 845 846 __m128 a0 = _mm_loadu_ps( src.m + 0*4 ); 847 __m128 a1 = _mm_loadu_ps( src.m + 1*4 ); 848 __m128 a2 = _mm_loadu_ps( src.m + 2*4 ); 849 __m128 a3 = _mm_loadu_ps( src.m + 3*4 ); 850 851 a0 = _mm_madd_ps( a0, _mm_splat_ps( rscale, 0 ), d0 ); 852 a1 = _mm_madd_ps( a1, _mm_splat_ps( rscale, 1 ), d1 ); 853 a2 = _mm_madd_ps( a2, _mm_splat_ps( rscale, 2 ), d2 ); 854 855 _mm_storeu_ps( out.m + 0*4, a0 ); 856 _mm_storeu_ps( out.m + 1*4, a1 ); 857 _mm_storeu_ps( out.m + 2*4, a2 ); 858 _mm_storeu_ps( out.m + 3*4, a3 ); 859 860 #else 861 862 const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] ); 863 const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] ); 864 865 out[0][0] = scale[0] * src[0][0]; 866 out[0][1] = scale[0] * src[0][1]; 867 out[0][2] = scale[0] * src[0][2]; 868 out[0][3] = scale[0] * ( src[0][3] + offset[0] ); 869 870 out[1][0] = scale[1] * src[1][0]; 871 out[1][1] = scale[1] * src[1][1]; 872 out[1][2] = scale[1] * src[1][2]; 873 out[1][3] = scale[1] * ( src[1][3] + offset[1] ); 874 875 out[2][0] = scale[2] * src[2][0]; 876 out[2][1] = scale[2] * src[2][1]; 877 out[2][2] 
= scale[2] * src[2][2]; 878 out[2][3] = scale[2] * ( src[2][3] + offset[2] ); 879 880 out[3][0] = src[3][0]; 881 out[3][1] = src[3][1]; 882 out[3][2] = src[3][2]; 883 out[3][3] = src[3][3]; 884 885 #endif 886 } 887 888 /* 889 ======================== 890 idRenderMatrix::Transpose 891 ======================== 892 */ 893 void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out ) { 894 assert( &src != &out ); 895 896 #ifdef ID_WIN_X86_SSE2_INTRIN 897 898 const __m128 a0 = _mm_loadu_ps( src.m + 0*4 ); 899 const __m128 a1 = _mm_loadu_ps( src.m + 1*4 ); 900 const __m128 a2 = _mm_loadu_ps( src.m + 2*4 ); 901 const __m128 a3 = _mm_loadu_ps( src.m + 3*4 ); 902 903 const __m128 r0 = _mm_unpacklo_ps( a0, a2 ); 904 const __m128 r1 = _mm_unpackhi_ps( a0, a2 ); 905 const __m128 r2 = _mm_unpacklo_ps( a1, a3 ); 906 const __m128 r3 = _mm_unpackhi_ps( a1, a3 ); 907 908 const __m128 t0 = _mm_unpacklo_ps( r0, r2 ); 909 const __m128 t1 = _mm_unpackhi_ps( r0, r2 ); 910 const __m128 t2 = _mm_unpacklo_ps( r1, r3 ); 911 const __m128 t3 = _mm_unpackhi_ps( r1, r3 ); 912 913 _mm_storeu_ps( out.m + 0*4, t0 ); 914 _mm_storeu_ps( out.m + 1*4, t1 ); 915 _mm_storeu_ps( out.m + 2*4, t2 ); 916 _mm_storeu_ps( out.m + 3*4, t3 ); 917 918 #else 919 920 for ( int i = 0; i < 4; i++ ) { 921 for ( int j = 0; j < 4; j++ ) { 922 out[i][j] = src[j][i]; 923 } 924 } 925 926 #endif 927 } 928 929 /* 930 ======================== 931 idRenderMatrix::Multiply 932 ======================== 933 */ 934 void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & b, idRenderMatrix & out ) { 935 936 #ifdef ID_WIN_X86_SSE2_INTRIN 937 938 __m128 a0 = _mm_loadu_ps( a.m + 0*4 ); 939 __m128 a1 = _mm_loadu_ps( a.m + 1*4 ); 940 __m128 a2 = _mm_loadu_ps( a.m + 2*4 ); 941 __m128 a3 = _mm_loadu_ps( a.m + 3*4 ); 942 943 __m128 b0 = _mm_loadu_ps( b.m + 0*4 ); 944 __m128 b1 = _mm_loadu_ps( b.m + 1*4 ); 945 __m128 b2 = _mm_loadu_ps( b.m + 2*4 ); 946 __m128 b3 = _mm_loadu_ps( b.m + 3*4 ); 947 
948 __m128 t0 = _mm_mul_ps( _mm_splat_ps( a0, 0 ), b0 ); 949 __m128 t1 = _mm_mul_ps( _mm_splat_ps( a1, 0 ), b0 ); 950 __m128 t2 = _mm_mul_ps( _mm_splat_ps( a2, 0 ), b0 ); 951 __m128 t3 = _mm_mul_ps( _mm_splat_ps( a3, 0 ), b0 ); 952 953 t0 = _mm_madd_ps( _mm_splat_ps( a0, 1 ), b1, t0 ); 954 t1 = _mm_madd_ps( _mm_splat_ps( a1, 1 ), b1, t1 ); 955 t2 = _mm_madd_ps( _mm_splat_ps( a2, 1 ), b1, t2 ); 956 t3 = _mm_madd_ps( _mm_splat_ps( a3, 1 ), b1, t3 ); 957 958 t0 = _mm_madd_ps( _mm_splat_ps( a0, 2 ), b2, t0 ); 959 t1 = _mm_madd_ps( _mm_splat_ps( a1, 2 ), b2, t1 ); 960 t2 = _mm_madd_ps( _mm_splat_ps( a2, 2 ), b2, t2 ); 961 t3 = _mm_madd_ps( _mm_splat_ps( a3, 2 ), b2, t3 ); 962 963 t0 = _mm_madd_ps( _mm_splat_ps( a0, 3 ), b3, t0 ); 964 t1 = _mm_madd_ps( _mm_splat_ps( a1, 3 ), b3, t1 ); 965 t2 = _mm_madd_ps( _mm_splat_ps( a2, 3 ), b3, t2 ); 966 t3 = _mm_madd_ps( _mm_splat_ps( a3, 3 ), b3, t3 ); 967 968 _mm_storeu_ps( out.m + 0*4, t0 ); 969 _mm_storeu_ps( out.m + 1*4, t1 ); 970 _mm_storeu_ps( out.m + 2*4, t2 ); 971 _mm_storeu_ps( out.m + 3*4, t3 ); 972 973 #else 974 975 /* 976 for ( int i = 0 ; i < 4 ; i++ ) { 977 for ( int j = 0 ; j < 4 ; j++ ) { 978 out.m[ i * 4 + j ] = 979 a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] + 980 a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] + 981 a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] + 982 a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ]; 983 } 984 } 985 */ 986 987 out.m[0*4+0] = a.m[0*4+0]*b.m[0*4+0] + a.m[0*4+1]*b.m[1*4+0] + a.m[0*4+2]*b.m[2*4+0] + a.m[0*4+3]*b.m[3*4+0]; 988 out.m[0*4+1] = a.m[0*4+0]*b.m[0*4+1] + a.m[0*4+1]*b.m[1*4+1] + a.m[0*4+2]*b.m[2*4+1] + a.m[0*4+3]*b.m[3*4+1]; 989 out.m[0*4+2] = a.m[0*4+0]*b.m[0*4+2] + a.m[0*4+1]*b.m[1*4+2] + a.m[0*4+2]*b.m[2*4+2] + a.m[0*4+3]*b.m[3*4+2]; 990 out.m[0*4+3] = a.m[0*4+0]*b.m[0*4+3] + a.m[0*4+1]*b.m[1*4+3] + a.m[0*4+2]*b.m[2*4+3] + a.m[0*4+3]*b.m[3*4+3]; 991 992 out.m[1*4+0] = a.m[1*4+0]*b.m[0*4+0] + a.m[1*4+1]*b.m[1*4+0] + a.m[1*4+2]*b.m[2*4+0] + a.m[1*4+3]*b.m[3*4+0]; 993 out.m[1*4+1] = a.m[1*4+0]*b.m[0*4+1] + 
a.m[1*4+1]*b.m[1*4+1] + a.m[1*4+2]*b.m[2*4+1] + a.m[1*4+3]*b.m[3*4+1]; 994 out.m[1*4+2] = a.m[1*4+0]*b.m[0*4+2] + a.m[1*4+1]*b.m[1*4+2] + a.m[1*4+2]*b.m[2*4+2] + a.m[1*4+3]*b.m[3*4+2]; 995 out.m[1*4+3] = a.m[1*4+0]*b.m[0*4+3] + a.m[1*4+1]*b.m[1*4+3] + a.m[1*4+2]*b.m[2*4+3] + a.m[1*4+3]*b.m[3*4+3]; 996 997 out.m[2*4+0] = a.m[2*4+0]*b.m[0*4+0] + a.m[2*4+1]*b.m[1*4+0] + a.m[2*4+2]*b.m[2*4+0] + a.m[2*4+3]*b.m[3*4+0]; 998 out.m[2*4+1] = a.m[2*4+0]*b.m[0*4+1] + a.m[2*4+1]*b.m[1*4+1] + a.m[2*4+2]*b.m[2*4+1] + a.m[2*4+3]*b.m[3*4+1]; 999 out.m[2*4+2] = a.m[2*4+0]*b.m[0*4+2] + a.m[2*4+1]*b.m[1*4+2] + a.m[2*4+2]*b.m[2*4+2] + a.m[2*4+3]*b.m[3*4+2]; 1000 out.m[2*4+3] = a.m[2*4+0]*b.m[0*4+3] + a.m[2*4+1]*b.m[1*4+3] + a.m[2*4+2]*b.m[2*4+3] + a.m[2*4+3]*b.m[3*4+3]; 1001 1002 out.m[3*4+0] = a.m[3*4+0]*b.m[0*4+0] + a.m[3*4+1]*b.m[1*4+0] + a.m[3*4+2]*b.m[2*4+0] + a.m[3*4+3]*b.m[3*4+0]; 1003 out.m[3*4+1] = a.m[3*4+0]*b.m[0*4+1] + a.m[3*4+1]*b.m[1*4+1] + a.m[3*4+2]*b.m[2*4+1] + a.m[3*4+3]*b.m[3*4+1]; 1004 out.m[3*4+2] = a.m[3*4+0]*b.m[0*4+2] + a.m[3*4+1]*b.m[1*4+2] + a.m[3*4+2]*b.m[2*4+2] + a.m[3*4+3]*b.m[3*4+2]; 1005 out.m[3*4+3] = a.m[3*4+0]*b.m[0*4+3] + a.m[3*4+1]*b.m[1*4+3] + a.m[3*4+2]*b.m[2*4+3] + a.m[3*4+3]*b.m[3*4+3]; 1006 1007 #endif 1008 } 1009 1010 /* 1011 ======================== 1012 idRenderMatrix::Inverse 1013 1014 inverse( M ) = ( 1 / determinant( M ) ) * transpose( cofactor( M ) ) 1015 1016 This code is based on the code written by Cédric Lallain, published on "Cell Performance" 1017 (by Mike Acton) and released under the BSD 3-Clause ("BSD New" or "BSD Simplified") license. 1018 https://code.google.com/p/cellperformance-snippets/ 1019 1020 Note that large parallel lights can have very small values in the projection matrix, 1021 scaling tens of thousands of world units down to a 0-1 range, so the determinants 1022 can get really, really small. 
========================
*/
// Returns false and leaves out untouched when the absolute value of the determinant is
// below the inverse epsilon, otherwise stores the inverse of src in out and returns true.
// Both code paths finish reading src before the first write to out.
bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) {

#ifdef ID_WIN_X86_SSE2_INTRIN

	const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 );
	const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 );
	const __m128 r2 = _mm_loadu_ps( src.m + 2 * 4 );
	const __m128 r3 = _mm_loadu_ps( src.m + 3 * 4 );

	// rXuY = row X rotated up by Y floats.
	const __m128 r0u1 = _mm_perm_ps( r0, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 r0u2 = _mm_perm_ps( r0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 r0u3 = _mm_perm_ps( r0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

	const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) );

	const __m128 r2u1 = _mm_perm_ps( r2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) );

	const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) );

	// 3x3 determinants built from rows 1, 2 and 3, one per lane, later used as the cofactors of row 0;
	// products that are rotations of an already computed product are obtained with a permute
	const __m128 m_r2u2_r3u3 = _mm_mul_ps( r2u2, r3u3 );
	const __m128 m_r1u1_r2u2_r3u3 = _mm_mul_ps( r1u1, m_r2u2_r3u3 );
	const __m128 m_r2u3_r3u1 = _mm_mul_ps( r2u3, r3u1 );
	const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 = _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 );
	const __m128 m_r2u1_r3u2 = _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
	const __m128 pos_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 );
	const __m128 m_r2u3_r3u2 = _mm_mul_ps( r2u3, r3u2 );
	const __m128 m_r1u1_r2u3_r3u2 = _mm_mul_ps( r1u1, m_r2u3_r3u2 );
	const __m128 m_r2u1_r3u3 = _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 = _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 );
	const __m128 m_r2u2_r3u1 = _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) );
	const __m128 neg_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 );
	const __m128 det3x3_r0 = _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 );

	// 3x3 determinants built from rows 0, 2 and 3, reusing the row 2 / row 3 products from above
	const __m128 m_r0u1_r2u2_r3u3 = _mm_mul_ps( r0u1, m_r2u2_r3u3 );
	const __m128 a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3 = _mm_madd_ps( r0u2, m_r2u3_r3u1, m_r0u1_r2u2_r3u3 );
	const __m128 pos_part_det3x3_r1 = _mm_madd_ps( r0u3, m_r2u1_r3u2, a_m_r0u2_r2u3_r3u1_m_r0u1_r2u2_r3u3 );
	const __m128 m_r0u1_r2u3_r3u2 = _mm_mul_ps( r0u1, m_r2u3_r3u2 );
	const __m128 a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2 = _mm_madd_ps( r0u2, m_r2u1_r3u3, m_r0u1_r2u3_r3u2 );
	const __m128 neg_part_det3x3_r1 = _mm_madd_ps( r0u3, m_r2u2_r3u1, a_m_r0u2_r2u1_r3u3_m_r0u1_r2u3_r3u2 );
	const __m128 det3x3_r1 = _mm_sub_ps( pos_part_det3x3_r1, neg_part_det3x3_r1 );

	// 3x3 determinants built from rows 0, 1 and 2
	const __m128 m_r0u1_r1u2 = _mm_mul_ps( r0u1, r1u2 );
	const __m128 m_r0u1_r1u2_r2u3 = _mm_mul_ps( m_r0u1_r1u2, r2u3 );
	const __m128 m_r0u2_r1u3 = _mm_perm_ps( m_r0u1_r1u2, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3 = _mm_madd_ps( m_r0u2_r1u3, r2u1, m_r0u1_r1u2_r2u3 );
	const __m128 m_r0u3_r1u1 = _mm_mul_ps( r0u3, r1u1 );
	const __m128 pos_part_det3x3_r3 = _mm_madd_ps( m_r0u3_r1u1, r2u2, a_m_r0u2_r1u3_r2u1_m_r0u1_r1u2_r2u3 );
	const __m128 m_r0u1_r1u3 = _mm_perm_ps( m_r0u3_r1u1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
	const __m128 m_r0u1_r1u3_r2u2 = _mm_mul_ps( m_r0u1_r1u3, r2u2 );
	const __m128 m_r0u2_r1u1 = _mm_mul_ps( r0u2, r1u1 );
	const __m128 a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2 = _mm_madd_ps( m_r0u2_r1u1, r2u3, m_r0u1_r1u3_r2u2 );
	const __m128 m_r0u3_r1u2 = _mm_perm_ps( m_r0u2_r1u1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
	const __m128 neg_part_det3x3_r3 = _mm_madd_ps( m_r0u3_r1u2, r2u1, a_m_r0u2_r1u1_r2u3_m_r0u1_r1u3_r2u2 );
	const __m128 det3x3_r3 = _mm_sub_ps( pos_part_det3x3_r3, neg_part_det3x3_r3 );

	// 3x3 determinants built from rows 0, 1 and 3, reusing the row 0 / row 1 products from above
	const __m128 m_r0u1_r1u2_r3u3 = _mm_mul_ps( m_r0u1_r1u2, r3u3 );
	const __m128 a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3 = _mm_madd_ps( m_r0u2_r1u3, r3u1, m_r0u1_r1u2_r3u3 );
	const __m128 pos_part_det3x3_r2 = _mm_madd_ps( m_r0u3_r1u1, r3u2, a_m_r0u2_r1u3_r3u1_m_r0u1_r1u2_r3u3 );
	const __m128 m_r0u1_r1u3_r3u2 = _mm_mul_ps( m_r0u1_r1u3, r3u2 );
	const __m128 a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2 = _mm_madd_ps( m_r0u2_r1u1, r3u3, m_r0u1_r1u3_r3u2 );
	const __m128 neg_part_det3x3_r2 = _mm_madd_ps( m_r0u3_r1u2, r3u1, a_m_r0u2_r1u1_r3u3_m_r0u1_r1u3_r3u2 );
	const __m128 det3x3_r2 = _mm_sub_ps( pos_part_det3x3_r2, neg_part_det3x3_r2 );

	// build the sign bit mask (all bits set, shifted left by 31) and the two alternating
	// patterns ( 0, sign, 0, sign ) and ( sign, 0, sign, 0 ) used to flip every other lane
	const __m128 c_zero = _mm_setzero_ps();
	const __m128 c_mask = _mm_cmpeq_ps( c_zero, c_zero );
	const __m128 c_signmask = _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) );
	const __m128 c_znzn = _mm_unpacklo_ps( c_zero, c_signmask );
	const __m128 c_nznz = _mm_unpacklo_ps( c_signmask, c_zero );

	// the XOR negates the lanes selected by the pattern, turning the 3x3 determinants into cofactors
	const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn );
	const __m128 cofactor_r1 = _mm_xor_ps( det3x3_r1, c_nznz );
	const __m128 cofactor_r2 = _mm_xor_ps( det3x3_r2, c_znzn );
	const __m128 cofactor_r3 = _mm_xor_ps( det3x3_r3, c_nznz );

	// determinant = row 0 dotted with its cofactors; the two rotate-and-add steps
	// leave the full sum in every lane
	const __m128 dot0 = _mm_mul_ps( r0, cofactor_r0 );
	const __m128 dot1 = _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) );
	const __m128 det = _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );

	// clear the sign bit to get the absolute value and bail out before anything is written to out
	const __m128 absDet = _mm_andnot_ps( c_signmask, det );
	if ( _mm_movemask_ps( _mm_cmplt_ps( absDet, vector_float_inverse_epsilon ) ) & 15 ) {
		return false;
	}

	const __m128 rcpDet = _mm_rcp32_ps( det );

	// transpose the cofactor matrix to get the adjoint
	const __m128 hi_part_r0_r2 = _mm_unpacklo_ps( cofactor_r0, cofactor_r2 );
	const __m128 lo_part_r0_r2 = _mm_unpackhi_ps( cofactor_r0, cofactor_r2 );
	const __m128 hi_part_r1_r3 = _mm_unpacklo_ps( cofactor_r1, cofactor_r3 );
	const __m128 lo_part_r1_r3 = _mm_unpackhi_ps( cofactor_r1, cofactor_r3 );

	const __m128 adjoint_r0 = _mm_unpacklo_ps( hi_part_r0_r2, hi_part_r1_r3 );
	const __m128 adjoint_r1 = _mm_unpackhi_ps( hi_part_r0_r2, hi_part_r1_r3 );
	const __m128 adjoint_r2 = _mm_unpacklo_ps( lo_part_r0_r2, lo_part_r1_r3 );
	const __m128 adjoint_r3 = _mm_unpackhi_ps( lo_part_r0_r2, lo_part_r1_r3 );

	// inverse = adjoint / determinant
	_mm_storeu_ps( out.m + 0 * 4, _mm_mul_ps( adjoint_r0, rcpDet ) );
	_mm_storeu_ps( out.m + 1 * 4, _mm_mul_ps( adjoint_r1, rcpDet ) );
	_mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) );
	_mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) );

#else

	// row length of the flat matrix array
	const int FRL = 4;

	// 84+4+16 = 104 multiplications
	//			1 division

	// Naming: det2_RS_CD is the 2x2 determinant taken from rows R,S and columns C,D,
	// det3_RST_CDE is the 3x3 determinant taken from rows R,S,T and columns C,D,E.

	// 2x2 sub-determinants required to calculate 4x4 determinant
	const float det2_01_01 = src.m[0*FRL+0] * src.m[1*FRL+1] - src.m[0*FRL+1] * src.m[1*FRL+0];
	const float det2_01_02 = src.m[0*FRL+0] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+0];
	const float det2_01_03 = src.m[0*FRL+0] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+0];
	const float det2_01_12 = src.m[0*FRL+1] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+1];
	const float det2_01_13 = src.m[0*FRL+1] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+1];
	const float det2_01_23 = src.m[0*FRL+2] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+2];

	// 3x3 sub-determinants required to calculate 4x4 determinant
	const float det3_201_012 = src.m[2*FRL+0] * det2_01_12 - src.m[2*FRL+1] * det2_01_02 + src.m[2*FRL+2] * det2_01_01;
	const float det3_201_013 = src.m[2*FRL+0] * det2_01_13 - src.m[2*FRL+1] * det2_01_03 + src.m[2*FRL+3] * det2_01_01;
	const float det3_201_023 = src.m[2*FRL+0] * det2_01_23 - src.m[2*FRL+2] * det2_01_03 + src.m[2*FRL+3] * det2_01_02;
	const float det3_201_123 = src.m[2*FRL+1] * det2_01_23 - src.m[2*FRL+2] * det2_01_13 + src.m[2*FRL+3] * det2_01_12;

	// expansion along the last row
	const float det = ( - det3_201_123 * src.m[3*FRL+0] + det3_201_023 * src.m[3*FRL+1] - det3_201_013 * src.m[3*FRL+2] + det3_201_012 * src.m[3*FRL+3] );

	// bail out before anything is written to out
	if ( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) {
		return false;
	}

	const float rcpDet = 1.0f / det;

	// remaining 2x2 sub-determinants
	const float det2_03_01 = src.m[0*FRL+0] * src.m[3*FRL+1] - src.m[0*FRL+1] * src.m[3*FRL+0];
	const float det2_03_02 = src.m[0*FRL+0] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+0];
	const float det2_03_03 = src.m[0*FRL+0] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+0];
	const float det2_03_12 = src.m[0*FRL+1] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+1];
	const float det2_03_13 = src.m[0*FRL+1] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+1];
	const float det2_03_23 = src.m[0*FRL+2] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+2];

	const float det2_13_01 = src.m[1*FRL+0] * src.m[3*FRL+1] - src.m[1*FRL+1] * src.m[3*FRL+0];
	const float det2_13_02 = src.m[1*FRL+0] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+0];
	const float det2_13_03 = src.m[1*FRL+0] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+0];
	const float det2_13_12 = src.m[1*FRL+1] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+1];
	const float det2_13_13 = src.m[1*FRL+1] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+1];
	const float det2_13_23 = src.m[1*FRL+2] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+2];

	// remaining 3x3 sub-determinants
	const float det3_203_012 = src.m[2*FRL+0] * det2_03_12 - src.m[2*FRL+1] * det2_03_02 + src.m[2*FRL+2] * det2_03_01;
	const float det3_203_013 = src.m[2*FRL+0] * det2_03_13 - src.m[2*FRL+1] * det2_03_03 + src.m[2*FRL+3] * det2_03_01;
	const float det3_203_023 = src.m[2*FRL+0] * det2_03_23 - src.m[2*FRL+2] * det2_03_03 + src.m[2*FRL+3] * det2_03_02;
	const float det3_203_123 = src.m[2*FRL+1] * det2_03_23 - src.m[2*FRL+2] * det2_03_13 + src.m[2*FRL+3] * det2_03_12;

	const float det3_213_012 = src.m[2*FRL+0] * det2_13_12 - src.m[2*FRL+1] * det2_13_02 + src.m[2*FRL+2] * det2_13_01;
	const float det3_213_013 = src.m[2*FRL+0] * det2_13_13 - src.m[2*FRL+1] * det2_13_03 + src.m[2*FRL+3] * det2_13_01;
	const float det3_213_023 = src.m[2*FRL+0] * det2_13_23 - src.m[2*FRL+2] * det2_13_03 + src.m[2*FRL+3] * det2_13_02;
	const float det3_213_123 = src.m[2*FRL+1] * det2_13_23 - src.m[2*FRL+2] * det2_13_13 + src.m[2*FRL+3] * det2_13_12;

	const float det3_301_012 = src.m[3*FRL+0] * det2_01_12 - src.m[3*FRL+1] * det2_01_02 + src.m[3*FRL+2] * det2_01_01;
	const float det3_301_013 = src.m[3*FRL+0] * det2_01_13 - src.m[3*FRL+1] * det2_01_03 + src.m[3*FRL+3] * det2_01_01;
	const float det3_301_023 = src.m[3*FRL+0] * det2_01_23 - src.m[3*FRL+2] * det2_01_03 + src.m[3*FRL+3] * det2_01_02;
	const float det3_301_123 = src.m[3*FRL+1] * det2_01_23 - src.m[3*FRL+2] * det2_01_13 + src.m[3*FRL+3] * det2_01_12;

	// signed 3x3 determinants divided by the determinant, written one output column at a time
	out.m[0*FRL+0] = - det3_213_123 * rcpDet;
	out.m[1*FRL+0] = + det3_213_023 * rcpDet;
	out.m[2*FRL+0] = - det3_213_013 * rcpDet;
	out.m[3*FRL+0] = + det3_213_012 * rcpDet;

	out.m[0*FRL+1] = + det3_203_123 * rcpDet;
	out.m[1*FRL+1] = - det3_203_023 * rcpDet;
	out.m[2*FRL+1] = + det3_203_013 * rcpDet;
	out.m[3*FRL+1] = - det3_203_012 * rcpDet;

	out.m[0*FRL+2] = + det3_301_123 * rcpDet;
	out.m[1*FRL+2] = - det3_301_023 * rcpDet;
	out.m[2*FRL+2] = + det3_301_013 * rcpDet;
	out.m[3*FRL+2] = - det3_301_012 * rcpDet;

	out.m[0*FRL+3] = - det3_201_123 * rcpDet;
	out.m[1*FRL+3] = + det3_201_023 * rcpDet;
	out.m[2*FRL+3] = - det3_201_013 * rcpDet;
	out.m[3*FRL+3] = + det3_201_012 * rcpDet;

#endif

	return true;
}

/*
========================
idRenderMatrix::InverseByTranspose

Writes the transpose of the upper-left 3x3 of src into out, sets the fourth column to the
negated product of that transpose with the fourth column of src, and sets the last row
to ( 0, 0, 0, 1 ). src and out must be different matrices.
NOTE(review): this is only a true inverse when the upper-left 3x3 is orthonormal; the
assert below only checks IsAffineTransform() -- confirm what that covers.
========================
*/
void idRenderMatrix::InverseByTranspose( const idRenderMatrix & src, idRenderMatrix & out ) {
	assert( &src != &out );
	assert( src.IsAffineTransform( 0.01f ) );

	// the statements are grouped by output column
	out[0][0] = src[0][0];
	out[1][0] = src[0][1];
	out[2][0] = src[0][2];
	out[3][0] = 0.0f;
	out[0][1] = src[1][0];
	out[1][1] = src[1][1];
	out[2][1] = src[1][2];
	out[3][1] = 0.0f;
	out[0][2] = src[2][0];
	out[1][2] = src[2][1];
	out[2][2] = src[2][2];
	out[3][2] = 0.0f;
	// each entry is the negated dot product of one column of the source 3x3 with the source fourth column
	out[0][3] = -( src[0][0] * src[0][3] + src[1][0] * src[1][3] + src[2][0] * src[2][3] );
	out[1][3] = -( src[0][1] * src[0][3] + src[1][1] * src[1][3] + src[2][1] * src[2][3] );
	out[2][3] = -( src[0][2] * src[0][3] + src[1][2] * src[1][3] + src[2][2] * src[2][3] );
	out[3][3] = 1.0f;
}


/*
========================
idRenderMatrix::InverseByDoubles

This should never be used at run-time.
This is only for tools where more precision is needed.
1251 ======================== 1252 */ 1253 bool idRenderMatrix::InverseByDoubles( const idRenderMatrix & src, idRenderMatrix & out ) { 1254 const int FRL = 4; 1255 1256 // 84+4+16 = 104 multiplications 1257 // 1 division 1258 1259 // 2x2 sub-determinants required to calculate 4x4 determinant 1260 const double det2_01_01 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[1*FRL+0]; 1261 const double det2_01_02 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+0]; 1262 const double det2_01_03 = (double)src.m[0*FRL+0] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+0]; 1263 const double det2_01_12 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[1*FRL+1]; 1264 const double det2_01_13 = (double)src.m[0*FRL+1] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+1]; 1265 const double det2_01_23 = (double)src.m[0*FRL+2] * (double)src.m[1*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[1*FRL+2]; 1266 1267 // 3x3 sub-determinants required to calculate 4x4 determinant 1268 const double det3_201_012 = (double)src.m[2*FRL+0] * det2_01_12 - (double)src.m[2*FRL+1] * det2_01_02 + (double)src.m[2*FRL+2] * det2_01_01; 1269 const double det3_201_013 = (double)src.m[2*FRL+0] * det2_01_13 - (double)src.m[2*FRL+1] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_01; 1270 const double det3_201_023 = (double)src.m[2*FRL+0] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_03 + (double)src.m[2*FRL+3] * det2_01_02; 1271 const double det3_201_123 = (double)src.m[2*FRL+1] * det2_01_23 - (double)src.m[2*FRL+2] * det2_01_13 + (double)src.m[2*FRL+3] * det2_01_12; 1272 1273 const double det = ( - det3_201_123 * (double)src.m[3*FRL+0] + det3_201_023 * (double)src.m[3*FRL+1] - det3_201_013 * (double)src.m[3*FRL+2] + det3_201_012 * (double)src.m[3*FRL+3] ); 1274 1275 const double rcpDet = 1.0f / det; 1276 1277 // remaining 2x2 
sub-determinants 1278 const double det2_03_01 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[0*FRL+1] * (double)src.m[3*FRL+0]; 1279 const double det2_03_02 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+0]; 1280 const double det2_03_03 = (double)src.m[0*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+0]; 1281 const double det2_03_12 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[0*FRL+2] * (double)src.m[3*FRL+1]; 1282 const double det2_03_13 = (double)src.m[0*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+1]; 1283 const double det2_03_23 = (double)src.m[0*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[0*FRL+3] * (double)src.m[3*FRL+2]; 1284 1285 const double det2_13_01 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+1] - (double)src.m[1*FRL+1] * (double)src.m[3*FRL+0]; 1286 const double det2_13_02 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+0]; 1287 const double det2_13_03 = (double)src.m[1*FRL+0] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+0]; 1288 const double det2_13_12 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+2] - (double)src.m[1*FRL+2] * (double)src.m[3*FRL+1]; 1289 const double det2_13_13 = (double)src.m[1*FRL+1] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+1]; 1290 const double det2_13_23 = (double)src.m[1*FRL+2] * (double)src.m[3*FRL+3] - (double)src.m[1*FRL+3] * (double)src.m[3*FRL+2]; 1291 1292 // remaining 3x3 sub-determinants 1293 const double det3_203_012 = (double)src.m[2*FRL+0] * det2_03_12 - (double)src.m[2*FRL+1] * det2_03_02 + (double)src.m[2*FRL+2] * det2_03_01; 1294 const double det3_203_013 = (double)src.m[2*FRL+0] * det2_03_13 - (double)src.m[2*FRL+1] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_01; 1295 const double det3_203_023 = (double)src.m[2*FRL+0] * det2_03_23 - 
(double)src.m[2*FRL+2] * det2_03_03 + (double)src.m[2*FRL+3] * det2_03_02; 1296 const double det3_203_123 = (double)src.m[2*FRL+1] * det2_03_23 - (double)src.m[2*FRL+2] * det2_03_13 + (double)src.m[2*FRL+3] * det2_03_12; 1297 1298 const double det3_213_012 = (double)src.m[2*FRL+0] * det2_13_12 - (double)src.m[2*FRL+1] * det2_13_02 + (double)src.m[2*FRL+2] * det2_13_01; 1299 const double det3_213_013 = (double)src.m[2*FRL+0] * det2_13_13 - (double)src.m[2*FRL+1] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_01; 1300 const double det3_213_023 = (double)src.m[2*FRL+0] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_03 + (double)src.m[2*FRL+3] * det2_13_02; 1301 const double det3_213_123 = (double)src.m[2*FRL+1] * det2_13_23 - (double)src.m[2*FRL+2] * det2_13_13 + (double)src.m[2*FRL+3] * det2_13_12; 1302 1303 const double det3_301_012 = (double)src.m[3*FRL+0] * det2_01_12 - (double)src.m[3*FRL+1] * det2_01_02 + (double)src.m[3*FRL+2] * det2_01_01; 1304 const double det3_301_013 = (double)src.m[3*FRL+0] * det2_01_13 - (double)src.m[3*FRL+1] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_01; 1305 const double det3_301_023 = (double)src.m[3*FRL+0] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_03 + (double)src.m[3*FRL+3] * det2_01_02; 1306 const double det3_301_123 = (double)src.m[3*FRL+1] * det2_01_23 - (double)src.m[3*FRL+2] * det2_01_13 + (double)src.m[3*FRL+3] * det2_01_12; 1307 1308 out.m[0*FRL+0] = (float)( - det3_213_123 * rcpDet ); 1309 out.m[1*FRL+0] = (float)( + det3_213_023 * rcpDet ); 1310 out.m[2*FRL+0] = (float)( - det3_213_013 * rcpDet ); 1311 out.m[3*FRL+0] = (float)( + det3_213_012 * rcpDet ); 1312 1313 out.m[0*FRL+1] = (float)( + det3_203_123 * rcpDet ); 1314 out.m[1*FRL+1] = (float)( - det3_203_023 * rcpDet ); 1315 out.m[2*FRL+1] = (float)( + det3_203_013 * rcpDet ); 1316 out.m[3*FRL+1] = (float)( - det3_203_012 * rcpDet ); 1317 1318 out.m[0*FRL+2] = (float)( + det3_301_123 * rcpDet ); 1319 out.m[1*FRL+2] = (float)( - det3_301_023 * rcpDet ); 1320 
out.m[2*FRL+2] = (float)( + det3_301_013 * rcpDet ); 1321 out.m[3*FRL+2] = (float)( - det3_301_012 * rcpDet ); 1322 1323 out.m[0*FRL+3] = (float)( - det3_201_123 * rcpDet ); 1324 out.m[1*FRL+3] = (float)( + det3_201_023 * rcpDet ); 1325 out.m[2*FRL+3] = (float)( - det3_201_013 * rcpDet ); 1326 out.m[3*FRL+3] = (float)( + det3_201_012 * rcpDet ); 1327 1328 return true; 1329 } 1330 1331 1332 /* 1333 ======================== 1334 DeterminantIsNegative 1335 ======================== 1336 */ 1337 #ifdef ID_WIN_X86_SSE2_INTRIN 1338 1339 void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const __m128 & r1, const __m128 & r2, const __m128 & r3 ) { 1340 1341 const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) ); 1342 const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1343 const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 1344 1345 const __m128 r2u2 = _mm_perm_ps( r2, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1346 const __m128 r2u3 = _mm_perm_ps( r2, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 1347 1348 const __m128 r3u1 = _mm_perm_ps( r3, _MM_SHUFFLE( 2, 1, 0, 3 ) ); 1349 const __m128 r3u2 = _mm_perm_ps( r3, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1350 const __m128 r3u3 = _mm_perm_ps( r3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 1351 1352 const __m128 m_r2u2_r3u3 = _mm_mul_ps( r2u2, r3u3 ); 1353 const __m128 m_r1u1_r2u2_r3u3 = _mm_mul_ps( r1u1, m_r2u2_r3u3 ); 1354 const __m128 m_r2u3_r3u1 = _mm_mul_ps( r2u3, r3u1 ); 1355 const __m128 a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 = _mm_madd_ps( r1u2, m_r2u3_r3u1, m_r1u1_r2u2_r3u3 ); 1356 const __m128 m_r2u1_r3u2 = _mm_perm_ps( m_r2u2_r3u3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 1357 const __m128 pos_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u1_r3u2, a_m_r1u2_r2u3_r3u1_m_r1u1_r2u2_r3u3 ); 1358 const __m128 m_r2u3_r3u2 = _mm_mul_ps( r2u3, r3u2 ); 1359 const __m128 m_r1u1_r2u3_r3u2 = _mm_mul_ps( r1u1, m_r2u3_r3u2 ); 1360 const __m128 m_r2u1_r3u3 = _mm_perm_ps( m_r2u3_r3u1, _MM_SHUFFLE( 1, 0, 3, 2 ) ); 1361 const __m128 
a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 = _mm_madd_ps( r1u2, m_r2u1_r3u3, m_r1u1_r2u3_r3u2 ); 1362 const __m128 m_r2u2_r3u1 = _mm_perm_ps( m_r2u3_r3u2, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 1363 const __m128 neg_part_det3x3_r0 = _mm_madd_ps( r1u3, m_r2u2_r3u1, a_m_r1u2_r2u1_r3u3_m_r1u1_r2u3_r3u2 ); 1364 const __m128 det3x3_r0 = _mm_sub_ps( pos_part_det3x3_r0, neg_part_det3x3_r0 ); 1365 1366 const __m128 c_zero = _mm_setzero_ps(); 1367 const __m128 c_mask = _mm_cmpeq_ps( c_zero, c_zero ); 1368 const __m128 c_signmask = _mm_castsi128_ps( _mm_slli_epi32( _mm_castps_si128( c_mask ), 31 ) ); 1369 const __m128 c_znzn = _mm_unpacklo_ps( c_zero, c_signmask ); 1370 1371 const __m128 cofactor_r0 = _mm_xor_ps( det3x3_r0, c_znzn ); 1372 1373 const __m128 dot0 = _mm_mul_ps( r0, cofactor_r0 ); 1374 const __m128 dot1 = _mm_add_ps( dot0, _mm_perm_ps( dot0, _MM_SHUFFLE( 2, 1, 0, 3 ) ) ); 1375 const __m128 det = _mm_add_ps( dot1, _mm_perm_ps( dot1, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 1376 1377 const __m128 result = _mm_cmpgt_ps( c_zero, det ); 1378 1379 negativeDeterminant = _mm_movemask_ps( result ) & 1; 1380 } 1381 1382 #else 1383 1384 void DeterminantIsNegative( bool & negativeDeterminant, const float * row0, const float * row1, const float * row2, const float * row3 ) { 1385 1386 // 2x2 sub-determinants required to calculate 4x4 determinant 1387 const float det2_01_01 = row0[0] * row1[1] - row0[1] * row1[0]; 1388 const float det2_01_02 = row0[0] * row1[2] - row0[2] * row1[0]; 1389 const float det2_01_03 = row0[0] * row1[3] - row0[3] * row1[0]; 1390 const float det2_01_12 = row0[1] * row1[2] - row0[2] * row1[1]; 1391 const float det2_01_13 = row0[1] * row1[3] - row0[3] * row1[1]; 1392 const float det2_01_23 = row0[2] * row1[3] - row0[3] * row1[2]; 1393 1394 // 3x3 sub-determinants required to calculate 4x4 determinant 1395 const float det3_201_012 = row2[0] * det2_01_12 - row2[1] * det2_01_02 + row2[2] * det2_01_01; 1396 const float det3_201_013 = row2[0] * det2_01_13 - row2[1] * det2_01_03 + 
row2[3] * det2_01_01; 1397 const float det3_201_023 = row2[0] * det2_01_23 - row2[2] * det2_01_03 + row2[3] * det2_01_02; 1398 const float det3_201_123 = row2[1] * det2_01_23 - row2[2] * det2_01_13 + row2[3] * det2_01_12; 1399 1400 const float det = ( - det3_201_123 * row3[0] + det3_201_023 * row3[1] - det3_201_013 * row3[2] + det3_201_012 * row3[3] ); 1401 1402 negativeDeterminant = ( det < 0.0f ); 1403 } 1404 1405 #endif 1406 1407 /* 1408 ======================== 1409 idRenderMatrix::CopyMatrix 1410 ======================== 1411 */ 1412 void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3 ) { 1413 assert_16_byte_aligned( row0.ToFloatPtr() ); 1414 assert_16_byte_aligned( row1.ToFloatPtr() ); 1415 assert_16_byte_aligned( row2.ToFloatPtr() ); 1416 assert_16_byte_aligned( row3.ToFloatPtr() ); 1417 1418 #ifdef ID_WIN_X86_SSE2_INTRIN 1419 1420 const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 ); 1421 const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 ); 1422 const __m128 r2 = _mm_loadu_ps( matrix.m + 2 * 4 ); 1423 const __m128 r3 = _mm_loadu_ps( matrix.m + 3 * 4 ); 1424 1425 _mm_store_ps( row0.ToFloatPtr(), r0 ); 1426 _mm_store_ps( row1.ToFloatPtr(), r1 ); 1427 _mm_store_ps( row2.ToFloatPtr(), r2 ); 1428 _mm_store_ps( row3.ToFloatPtr(), r3 ); 1429 1430 #else 1431 1432 memcpy( row0.ToFloatPtr(), matrix[0], sizeof( idVec4 ) ); 1433 memcpy( row1.ToFloatPtr(), matrix[1], sizeof( idVec4 ) ); 1434 memcpy( row2.ToFloatPtr(), matrix[2], sizeof( idVec4 ) ); 1435 memcpy( row3.ToFloatPtr(), matrix[3], sizeof( idVec4 ) ); 1436 1437 #endif 1438 } 1439 1440 /* 1441 ======================== 1442 idRenderMatrix::SetMVP 1443 ======================== 1444 */ 1445 void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) { 1446 assert_16_byte_aligned( row0.ToFloatPtr() ); 1447 assert_16_byte_aligned( row1.ToFloatPtr() ); 1448 
assert_16_byte_aligned( row2.ToFloatPtr() ); 1449 assert_16_byte_aligned( row3.ToFloatPtr() ); 1450 1451 #ifdef ID_WIN_X86_SSE2_INTRIN 1452 1453 const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); 1454 const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); 1455 const __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 ); 1456 const __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 ); 1457 1458 _mm_store_ps( row0.ToFloatPtr(), r0 ); 1459 _mm_store_ps( row1.ToFloatPtr(), r1 ); 1460 _mm_store_ps( row2.ToFloatPtr(), r2 ); 1461 _mm_store_ps( row3.ToFloatPtr(), r3 ); 1462 1463 DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); 1464 1465 #else 1466 1467 memcpy( row0.ToFloatPtr(), mvp[0], sizeof( idVec4 ) ); 1468 memcpy( row1.ToFloatPtr(), mvp[1], sizeof( idVec4 ) ); 1469 memcpy( row2.ToFloatPtr(), mvp[2], sizeof( idVec4 ) ); 1470 memcpy( row3.ToFloatPtr(), mvp[3], sizeof( idVec4 ) ); 1471 1472 DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] ); 1473 1474 #endif 1475 } 1476 1477 /* 1478 ======================== 1479 idRenderMatrix::SetMVPForBounds 1480 ======================== 1481 */ 1482 void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds & bounds, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) { 1483 assert_16_byte_aligned( row0.ToFloatPtr() ); 1484 assert_16_byte_aligned( row1.ToFloatPtr() ); 1485 assert_16_byte_aligned( row2.ToFloatPtr() ); 1486 assert_16_byte_aligned( row3.ToFloatPtr() ); 1487 1488 #ifdef ID_WIN_X86_SSE2_INTRIN 1489 1490 __m128 b0 = _mm_loadu_bounds_0( bounds ); 1491 __m128 b1 = _mm_loadu_bounds_1( bounds ); 1492 1493 __m128 offset = _mm_mul_ps( _mm_add_ps( b1, b0 ), vector_float_half ); 1494 __m128 scale = _mm_mul_ps( _mm_sub_ps( b1, b0 ), vector_float_half ); 1495 1496 scale = _mm_or_ps( scale, vector_float_last_one ); 1497 1498 __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); 1499 __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); 1500 __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 ); 1501 
__m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 ); 1502 1503 __m128 d0 = _mm_mul_ps( r0, offset ); 1504 __m128 d1 = _mm_mul_ps( r1, offset ); 1505 __m128 d2 = _mm_mul_ps( r2, offset ); 1506 __m128 d3 = _mm_mul_ps( r3, offset ); 1507 1508 __m128 s0 = _mm_unpacklo_ps( d0, d2 ); // a0, c0, a1, c1 1509 __m128 s1 = _mm_unpackhi_ps( d0, d2 ); // a2, c2, a3, c3 1510 __m128 s2 = _mm_unpacklo_ps( d1, d3 ); // b0, d0, b1, d1 1511 __m128 s3 = _mm_unpackhi_ps( d1, d3 ); // b2, d2, b3, d3 1512 1513 __m128 t0 = _mm_unpacklo_ps( s0, s2 ); // a0, b0, c0, d0 1514 __m128 t1 = _mm_unpackhi_ps( s0, s2 ); // a1, b1, c1, d1 1515 __m128 t2 = _mm_unpacklo_ps( s1, s3 ); // a2, b2, c2, d2 1516 1517 t0 = _mm_add_ps( t0, t1 ); 1518 t0 = _mm_add_ps( t0, t2 ); 1519 1520 __m128 n0 = _mm_and_ps( _mm_splat_ps( t0, 0 ), vector_float_keep_last ); 1521 __m128 n1 = _mm_and_ps( _mm_splat_ps( t0, 1 ), vector_float_keep_last ); 1522 __m128 n2 = _mm_and_ps( _mm_splat_ps( t0, 2 ), vector_float_keep_last ); 1523 __m128 n3 = _mm_and_ps( _mm_splat_ps( t0, 3 ), vector_float_keep_last ); 1524 1525 r0 = _mm_madd_ps( r0, scale, n0 ); 1526 r1 = _mm_madd_ps( r1, scale, n1 ); 1527 r2 = _mm_madd_ps( r2, scale, n2 ); 1528 r3 = _mm_madd_ps( r3, scale, n3 ); 1529 1530 _mm_store_ps( row0.ToFloatPtr(), r0 ); 1531 _mm_store_ps( row1.ToFloatPtr(), r1 ); 1532 _mm_store_ps( row2.ToFloatPtr(), r2 ); 1533 _mm_store_ps( row3.ToFloatPtr(), r3 ); 1534 1535 DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); 1536 1537 #else 1538 1539 const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f; 1540 const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f; 1541 1542 row0[0] = mvp[0][0] * scale[0]; 1543 row0[1] = mvp[0][1] * scale[1]; 1544 row0[2] = mvp[0][2] * scale[2]; 1545 row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2]; 1546 1547 row1[0] = mvp[1][0] * scale[0]; 1548 row1[1] = mvp[1][1] * scale[1]; 1549 row1[2] = mvp[1][2] * scale[2]; 1550 row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + 
mvp[1][1] * offset[1] + mvp[1][2] * offset[2]; 1551 1552 row2[0] = mvp[2][0] * scale[0]; 1553 row2[1] = mvp[2][1] * scale[1]; 1554 row2[2] = mvp[2][2] * scale[2]; 1555 row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2]; 1556 1557 row3[0] = mvp[3][0] * scale[0]; 1558 row3[1] = mvp[3][1] * scale[1]; 1559 row3[2] = mvp[3][2] * scale[2]; 1560 row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2]; 1561 1562 DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); 1563 1564 #endif 1565 } 1566 1567 /* 1568 ======================== 1569 idRenderMatrix::SetMVPForInverseProject 1570 ======================== 1571 */ 1572 void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const idRenderMatrix & inverseProject, idVec4 & row0, idVec4 & row1, idVec4 & row2, idVec4 & row3, bool & negativeDeterminant ) { 1573 assert_16_byte_aligned( row0.ToFloatPtr() ); 1574 assert_16_byte_aligned( row1.ToFloatPtr() ); 1575 assert_16_byte_aligned( row2.ToFloatPtr() ); 1576 assert_16_byte_aligned( row3.ToFloatPtr() ); 1577 1578 #ifdef ID_WIN_X86_SSE2_INTRIN 1579 1580 __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); 1581 __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); 1582 __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 ); 1583 __m128 r3 = _mm_loadu_ps( mvp.m + 3 * 4 ); 1584 1585 __m128 p0 = _mm_loadu_ps( inverseProject.m + 0 * 4 ); 1586 __m128 p1 = _mm_loadu_ps( inverseProject.m + 1 * 4 ); 1587 __m128 p2 = _mm_loadu_ps( inverseProject.m + 2 * 4 ); 1588 __m128 p3 = _mm_loadu_ps( inverseProject.m + 3 * 4 ); 1589 1590 __m128 t0 = _mm_mul_ps( _mm_splat_ps( r0, 0 ), p0 ); 1591 __m128 t1 = _mm_mul_ps( _mm_splat_ps( r1, 0 ), p0 ); 1592 __m128 t2 = _mm_mul_ps( _mm_splat_ps( r2, 0 ), p0 ); 1593 __m128 t3 = _mm_mul_ps( _mm_splat_ps( r3, 0 ), p0 ); 1594 1595 t0 = _mm_madd_ps( _mm_splat_ps( r0, 1 ), p1, t0 ); 1596 t1 = _mm_madd_ps( _mm_splat_ps( r1, 1 ), p1, t1 ); 
1597 t2 = _mm_madd_ps( _mm_splat_ps( r2, 1 ), p1, t2 ); 1598 t3 = _mm_madd_ps( _mm_splat_ps( r3, 1 ), p1, t3 ); 1599 1600 t0 = _mm_madd_ps( _mm_splat_ps( r0, 2 ), p2, t0 ); 1601 t1 = _mm_madd_ps( _mm_splat_ps( r1, 2 ), p2, t1 ); 1602 t2 = _mm_madd_ps( _mm_splat_ps( r2, 2 ), p2, t2 ); 1603 t3 = _mm_madd_ps( _mm_splat_ps( r3, 2 ), p2, t3 ); 1604 1605 t0 = _mm_madd_ps( _mm_splat_ps( r0, 3 ), p3, t0 ); 1606 t1 = _mm_madd_ps( _mm_splat_ps( r1, 3 ), p3, t1 ); 1607 t2 = _mm_madd_ps( _mm_splat_ps( r2, 3 ), p3, t2 ); 1608 t3 = _mm_madd_ps( _mm_splat_ps( r3, 3 ), p3, t3 ); 1609 1610 _mm_store_ps( row0.ToFloatPtr(), t0 ); 1611 _mm_store_ps( row1.ToFloatPtr(), t1 ); 1612 _mm_store_ps( row2.ToFloatPtr(), t2 ); 1613 _mm_store_ps( row3.ToFloatPtr(), t3 ); 1614 1615 DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 ); 1616 1617 #else 1618 1619 row0[0] = mvp.m[0*4+0]*inverseProject.m[0*4+0] + mvp.m[0*4+1]*inverseProject.m[1*4+0] + mvp.m[0*4+2]*inverseProject.m[2*4+0] + mvp.m[0*4+3]*inverseProject.m[3*4+0]; 1620 row0[1] = mvp.m[0*4+0]*inverseProject.m[0*4+1] + mvp.m[0*4+1]*inverseProject.m[1*4+1] + mvp.m[0*4+2]*inverseProject.m[2*4+1] + mvp.m[0*4+3]*inverseProject.m[3*4+1]; 1621 row0[2] = mvp.m[0*4+0]*inverseProject.m[0*4+2] + mvp.m[0*4+1]*inverseProject.m[1*4+2] + mvp.m[0*4+2]*inverseProject.m[2*4+2] + mvp.m[0*4+3]*inverseProject.m[3*4+2]; 1622 row0[3] = mvp.m[0*4+0]*inverseProject.m[0*4+3] + mvp.m[0*4+1]*inverseProject.m[1*4+3] + mvp.m[0*4+2]*inverseProject.m[2*4+3] + mvp.m[0*4+3]*inverseProject.m[3*4+3]; 1623 1624 row1[0] = mvp.m[1*4+0]*inverseProject.m[0*4+0] + mvp.m[1*4+1]*inverseProject.m[1*4+0] + mvp.m[1*4+2]*inverseProject.m[2*4+0] + mvp.m[1*4+3]*inverseProject.m[3*4+0]; 1625 row1[1] = mvp.m[1*4+0]*inverseProject.m[0*4+1] + mvp.m[1*4+1]*inverseProject.m[1*4+1] + mvp.m[1*4+2]*inverseProject.m[2*4+1] + mvp.m[1*4+3]*inverseProject.m[3*4+1]; 1626 row1[2] = mvp.m[1*4+0]*inverseProject.m[0*4+2] + mvp.m[1*4+1]*inverseProject.m[1*4+2] + mvp.m[1*4+2]*inverseProject.m[2*4+2] 
+ mvp.m[1*4+3]*inverseProject.m[3*4+2]; 1627 row1[3] = mvp.m[1*4+0]*inverseProject.m[0*4+3] + mvp.m[1*4+1]*inverseProject.m[1*4+3] + mvp.m[1*4+2]*inverseProject.m[2*4+3] + mvp.m[1*4+3]*inverseProject.m[3*4+3]; 1628 1629 row2[0] = mvp.m[2*4+0]*inverseProject.m[0*4+0] + mvp.m[2*4+1]*inverseProject.m[1*4+0] + mvp.m[2*4+2]*inverseProject.m[2*4+0] + mvp.m[2*4+3]*inverseProject.m[3*4+0]; 1630 row2[1] = mvp.m[2*4+0]*inverseProject.m[0*4+1] + mvp.m[2*4+1]*inverseProject.m[1*4+1] + mvp.m[2*4+2]*inverseProject.m[2*4+1] + mvp.m[2*4+3]*inverseProject.m[3*4+1]; 1631 row2[2] = mvp.m[2*4+0]*inverseProject.m[0*4+2] + mvp.m[2*4+1]*inverseProject.m[1*4+2] + mvp.m[2*4+2]*inverseProject.m[2*4+2] + mvp.m[2*4+3]*inverseProject.m[3*4+2]; 1632 row2[3] = mvp.m[2*4+0]*inverseProject.m[0*4+3] + mvp.m[2*4+1]*inverseProject.m[1*4+3] + mvp.m[2*4+2]*inverseProject.m[2*4+3] + mvp.m[2*4+3]*inverseProject.m[3*4+3]; 1633 1634 row3[0] = mvp.m[3*4+0]*inverseProject.m[0*4+0] + mvp.m[3*4+1]*inverseProject.m[1*4+0] + mvp.m[3*4+2]*inverseProject.m[2*4+0] + mvp.m[3*4+3]*inverseProject.m[3*4+0]; 1635 row3[1] = mvp.m[3*4+0]*inverseProject.m[0*4+1] + mvp.m[3*4+1]*inverseProject.m[1*4+1] + mvp.m[3*4+2]*inverseProject.m[2*4+1] + mvp.m[3*4+3]*inverseProject.m[3*4+1]; 1636 row3[2] = mvp.m[3*4+0]*inverseProject.m[0*4+2] + mvp.m[3*4+1]*inverseProject.m[1*4+2] + mvp.m[3*4+2]*inverseProject.m[2*4+2] + mvp.m[3*4+3]*inverseProject.m[3*4+2]; 1637 row3[3] = mvp.m[3*4+0]*inverseProject.m[0*4+3] + mvp.m[3*4+1]*inverseProject.m[1*4+3] + mvp.m[3*4+2]*inverseProject.m[2*4+3] + mvp.m[3*4+3]*inverseProject.m[3*4+3]; 1638 1639 DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); 1640 1641 #endif 1642 } 1643 1644 /* 1645 ======================== 1646 idRenderMatrix::CullPointToMVPbits 1647 1648 Returns true if the point transformed by the given Model View Projection (MVP) matrix is 1649 outside the clip space. 
1650 1651 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne' 1652 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix. 1653 ======================== 1654 */ 1655 bool idRenderMatrix::CullPointToMVPbits( const idRenderMatrix & mvp, const idVec3 & p, byte * outBits, bool zeroToOne ) { 1656 1657 idVec4 c; 1658 for ( int i = 0; i < 4; i++ ) { 1659 c[i] = p[0] * mvp[i][0] + p[1] * mvp[i][1] + p[2] * mvp[i][2] + mvp[i][3]; 1660 } 1661 1662 const float minW = zeroToOne ? 0.0f : -c[3]; 1663 const float maxW = c[3]; 1664 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 1665 const float minZ = 0.0f; 1666 #else 1667 const float minZ = minW; 1668 #endif 1669 1670 int bits = 0; 1671 if ( c[0] > minW ) { bits |= ( 1 << 0 ); } 1672 if ( c[0] < maxW ) { bits |= ( 1 << 1 ); } 1673 if ( c[1] > minW ) { bits |= ( 1 << 2 ); } 1674 if ( c[1] < maxW ) { bits |= ( 1 << 3 ); } 1675 if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ 1676 if ( c[2] < maxW ) { bits |= ( 1 << 5 ); } 1677 1678 // store out a bit set for each side where the point is outside the clip space 1679 *outBits = (byte)( bits ^ 63 ); 1680 1681 // if any bits weren't set, the point is completely off one side of the frustum 1682 return ( bits != 63 ); 1683 } 1684 1685 /* 1686 ======================== 1687 idRenderMatrix::CullBoundsToMVPbits 1688 1689 Returns true if nothing contained in the bounds is transformed by the given 1690 Model View Projection (MVP) matrix to anything inside the clip space. 1691 1692 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne' 1693 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix. 1694 1695 When all the corners of the bounding box are behind one of the six frustum planes, the box is 1696 culled. 
This is conservative, because some boxes may "cross corners" and can be in front of a 1697 frustum plane, but only while also being behind another one. 1698 ======================== 1699 */ 1700 bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, byte * outBits, bool zeroToOne ) { 1701 1702 #ifdef ID_WIN_X86_SSE2_INTRIN 1703 1704 __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 1705 __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 1706 __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 1707 __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 1708 1709 __m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one; 1710 1711 __m128 b0 = _mm_loadu_bounds_0( bounds ); 1712 __m128 b1 = _mm_loadu_bounds_1( bounds ); 1713 1714 // take the four points on the X-Y plane 1715 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 1716 __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 1717 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 1718 1719 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 1720 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 1721 1722 // compute four partial X,Y,Z,W values 1723 __m128 parx = _mm_splat_ps( mvp0, 3 ); 1724 __m128 pary = _mm_splat_ps( mvp1, 3 ); 1725 __m128 parz = _mm_splat_ps( mvp2, 3 ); 1726 __m128 parw = _mm_splat_ps( mvp3, 3 ); 1727 1728 parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx ); 1729 pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary ); 1730 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 1731 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 1732 1733 parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx ); 1734 pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary ); 1735 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 1736 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 1737 1738 // compute full X,Y,Z,W values 1739 __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 
1740 __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 1741 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 1742 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 1743 1744 __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx ); 1745 __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary ); 1746 __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz ); 1747 __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw ); 1748 1749 __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx ); 1750 __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary ); 1751 __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz ); 1752 __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw ); 1753 1754 __m128 maxW0 = w0; 1755 __m128 maxW1 = w1; 1756 __m128 minW0 = _mm_mul_ps( w0, minMul ); 1757 __m128 minW1 = _mm_mul_ps( w1, minMul ); 1758 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 1759 __m128 minZ0 = vector_float_zero; 1760 __m128 minZ1 = vector_float_zero; 1761 #else 1762 __m128 minZ0 = minW0; 1763 __m128 minZ1 = minW1; 1764 #endif 1765 1766 __m128 cullBits0 = _mm_cmpgt_ps( x0, minW0 ); 1767 __m128 cullBits1 = _mm_cmpgt_ps( maxW0, x0 ); 1768 __m128 cullBits2 = _mm_cmpgt_ps( y0, minW0 ); 1769 __m128 cullBits3 = _mm_cmpgt_ps( maxW0, y0 ); 1770 __m128 cullBits4 = _mm_cmpgt_ps( z0, minZ0 ); // NOTE: using minZ0 1771 __m128 cullBits5 = _mm_cmpgt_ps( maxW0, z0 ); 1772 1773 cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) ); 1774 cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) ); 1775 cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) ); 1776 cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) ); 1777 cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1 1778 cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) ); 1779 1780 cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 ); 1781 cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 ); 1782 cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 ); 1783 cullBits3 = _mm_and_ps( cullBits3, 
vector_float_mask3 ); 1784 cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 ); 1785 cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 ); 1786 1787 cullBits0 = _mm_or_ps( cullBits0, cullBits1 ); 1788 cullBits2 = _mm_or_ps( cullBits2, cullBits3 ); 1789 cullBits4 = _mm_or_ps( cullBits4, cullBits5 ); 1790 cullBits0 = _mm_or_ps( cullBits0, cullBits2 ); 1791 cullBits0 = _mm_or_ps( cullBits0, cullBits4 ); 1792 1793 cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 1794 cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) ); 1795 1796 int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 ); 1797 1798 *outBits = (byte)( bits ^ 63 ); 1799 1800 return ( bits != 63 ); 1801 1802 #else 1803 1804 int bits = 0; 1805 1806 idVec3 v; 1807 for ( int x = 0; x < 2; x++ ) { 1808 v[0] = bounds[x][0]; 1809 for ( int y = 0; y < 2; y++ ) { 1810 v[1] = bounds[y][1]; 1811 for ( int z = 0; z < 2; z++ ) { 1812 v[2] = bounds[z][2]; 1813 1814 idVec4 c; 1815 for ( int i = 0; i < 4; i++ ) { 1816 c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3]; 1817 } 1818 1819 const float minW = zeroToOne ? 
0.0f : -c[3]; 1820 const float maxW = c[3]; 1821 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 1822 const float minZ = 0.0f; 1823 #else 1824 const float minZ = minW; 1825 #endif 1826 1827 if ( c[0] > minW ) { bits |= ( 1 << 0 ); } 1828 if ( c[0] < maxW ) { bits |= ( 1 << 1 ); } 1829 if ( c[1] > minW ) { bits |= ( 1 << 2 ); } 1830 if ( c[1] < maxW ) { bits |= ( 1 << 3 ); } 1831 if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ 1832 if ( c[2] < maxW ) { bits |= ( 1 << 5 ); } 1833 } 1834 } 1835 } 1836 1837 // store out a bit set for each side where the bounds is outside the clip space 1838 *outBits = (byte)( bits ^ 63 ); 1839 1840 // if any bits weren't set, the bounds is completely off one side of the frustum 1841 return ( bits != 63 ); 1842 1843 #endif 1844 } 1845 1846 /* 1847 ======================== 1848 idRenderMatrix::CullExtrudedBoundsToMVPbits 1849 1850 Returns true if nothing contained in the extruded bounds is transformed by the 1851 given Model View Projection (MVP) matrix to anything inside the clip space. 1852 1853 The given bounds is extruded in the 'extrudeDirection' up to the 'clipPlane'. 1854 1855 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne' 1856 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix. 1857 1858 When all the corners of the bounding box are behind one of the six frustum planes, the box is 1859 culled. This is conservative, because some boxes may "cross corners" and can be in front of a 1860 frustum plane, but only while also being behind another one. 
1861 ======================== 1862 */ 1863 bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, byte * outBits, bool zeroToOne ) { 1864 assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); 1865 1866 #ifdef ID_WIN_X86_SSE2_INTRIN 1867 1868 __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 1869 __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 1870 __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 1871 __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 1872 1873 __m128 minMul = zeroToOne ? vector_float_zero : vector_float_neg_one; 1874 1875 __m128 b0 = _mm_loadu_bounds_0( bounds ); 1876 __m128 b1 = _mm_loadu_bounds_1( bounds ); 1877 1878 // take the four points on the X-Y plane 1879 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 1880 __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 1881 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 1882 1883 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 1884 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 1885 1886 __m128 cullBits0; 1887 __m128 cullBits1; 1888 __m128 cullBits2; 1889 __m128 cullBits3; 1890 __m128 cullBits4; 1891 __m128 cullBits5; 1892 1893 // calculate the cull bits for the bounding box corners 1894 { 1895 // compute four partial X,Y,Z,W values 1896 __m128 parx = _mm_splat_ps( mvp0, 3 ); 1897 __m128 pary = _mm_splat_ps( mvp1, 3 ); 1898 __m128 parz = _mm_splat_ps( mvp2, 3 ); 1899 __m128 parw = _mm_splat_ps( mvp3, 3 ); 1900 1901 parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx ); 1902 pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary ); 1903 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 1904 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 1905 1906 parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx ); 1907 pary = _mm_madd_ps( vy, 
_mm_splat_ps( mvp1, 1 ), pary ); 1908 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 1909 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 1910 1911 // compute full X,Y,Z,W values 1912 __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 1913 __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 1914 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 1915 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 1916 1917 __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx ); 1918 __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary ); 1919 __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz ); 1920 __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw ); 1921 1922 __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx ); 1923 __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary ); 1924 __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz ); 1925 __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw ); 1926 1927 __m128 maxW0 = w0; 1928 __m128 maxW1 = w1; 1929 __m128 minW0 = _mm_mul_ps( w0, minMul ); 1930 __m128 minW1 = _mm_mul_ps( w1, minMul ); 1931 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 1932 __m128 minZ0 = vector_float_zero; 1933 __m128 minZ1 = vector_float_zero; 1934 #else 1935 __m128 minZ0 = minW0; 1936 __m128 minZ1 = minW1; 1937 #endif 1938 1939 cullBits0 = _mm_cmpgt_ps( x0, minW0 ); 1940 cullBits1 = _mm_cmpgt_ps( maxW0, x0 ); 1941 cullBits2 = _mm_cmpgt_ps( y0, minW0 ); 1942 cullBits3 = _mm_cmpgt_ps( maxW0, y0 ); 1943 cullBits4 = _mm_cmpgt_ps( z0, minZ0 ); // NOTE: using minZ0 1944 cullBits5 = _mm_cmpgt_ps( maxW0, z0 ); 1945 1946 cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) ); 1947 cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) ); 1948 cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) ); 1949 cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) ); 1950 cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1 1951 cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) ); 1952 } 1953 1954 // calculate and include 
the cull bits for the extruded bounding box corners 1955 { 1956 __m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 ); 1957 __m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 ); 1958 __m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 ); 1959 __m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 ); 1960 1961 __m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 ); 1962 __m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 ); 1963 __m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 ); 1964 1965 __m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) ); 1966 __m128 invClosing = _mm_rcp32_ps( closing ); 1967 invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit ); 1968 1969 __m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) ); 1970 __m128 d0 = _mm_madd_ps( clipZ, vz0, dt ); 1971 __m128 d1 = _mm_madd_ps( clipZ, vz1, dt ); 1972 1973 d0 = _mm_mul_ps( d0, invClosing ); 1974 d1 = _mm_mul_ps( d1, invClosing ); 1975 1976 __m128 vx0 = _mm_madd_ps( extrudeX, d0, vx ); 1977 __m128 vx1 = _mm_madd_ps( extrudeX, d1, vx ); 1978 1979 __m128 vy0 = _mm_madd_ps( extrudeY, d0, vy ); 1980 __m128 vy1 = _mm_madd_ps( extrudeY, d1, vy ); 1981 1982 vz0 = _mm_madd_ps( extrudeZ, d0, vz0 ); 1983 vz1 = _mm_madd_ps( extrudeZ, d1, vz1 ); 1984 1985 __m128 mvp0X = _mm_splat_ps( mvp0, 0 ); 1986 __m128 mvp1X = _mm_splat_ps( mvp1, 0 ); 1987 __m128 mvp2X = _mm_splat_ps( mvp2, 0 ); 1988 __m128 mvp3X = _mm_splat_ps( mvp3, 0 ); 1989 1990 __m128 mvp0W = _mm_splat_ps( mvp0, 3 ); 1991 __m128 mvp1W = _mm_splat_ps( mvp1, 3 ); 1992 __m128 mvp2W = _mm_splat_ps( mvp2, 3 ); 1993 __m128 mvp3W = _mm_splat_ps( mvp3, 3 ); 1994 1995 __m128 x0 = _mm_madd_ps( vx0, mvp0X, mvp0W ); 1996 __m128 y0 = _mm_madd_ps( vx0, mvp1X, mvp1W ); 1997 __m128 z0 = _mm_madd_ps( vx0, mvp2X, 
mvp2W ); 1998 __m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W ); 1999 2000 __m128 x1 = _mm_madd_ps( vx1, mvp0X, mvp0W ); 2001 __m128 y1 = _mm_madd_ps( vx1, mvp1X, mvp1W ); 2002 __m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W ); 2003 __m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W ); 2004 2005 __m128 mvp0Y = _mm_splat_ps( mvp0, 1 ); 2006 __m128 mvp1Y = _mm_splat_ps( mvp1, 1 ); 2007 __m128 mvp2Y = _mm_splat_ps( mvp2, 1 ); 2008 __m128 mvp3Y = _mm_splat_ps( mvp3, 1 ); 2009 2010 x0 = _mm_madd_ps( vy0, mvp0Y, x0 ); //-V537 2011 y0 = _mm_madd_ps( vy0, mvp1Y, y0 ); 2012 z0 = _mm_madd_ps( vy0, mvp2Y, z0 ); //-V537 2013 w0 = _mm_madd_ps( vy0, mvp3Y, w0 ); 2014 2015 x1 = _mm_madd_ps( vy1, mvp0Y, x1 ); //-V537 2016 y1 = _mm_madd_ps( vy1, mvp1Y, y1 ); 2017 z1 = _mm_madd_ps( vy1, mvp2Y, z1 ); //-V537 2018 w1 = _mm_madd_ps( vy1, mvp3Y, w1 ); 2019 2020 __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 2021 __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 2022 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 2023 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 2024 2025 x0 = _mm_madd_ps( vz0, mvp0Z, x0 ); 2026 y0 = _mm_madd_ps( vz0, mvp1Z, y0 ); //-V537 2027 z0 = _mm_madd_ps( vz0, mvp2Z, z0 ); 2028 w0 = _mm_madd_ps( vz0, mvp3Z, w0 ); 2029 2030 x1 = _mm_madd_ps( vz1, mvp0Z, x1 ); 2031 y1 = _mm_madd_ps( vz1, mvp1Z, y1 ); //-V537 2032 z1 = _mm_madd_ps( vz1, mvp2Z, z1 ); 2033 w1 = _mm_madd_ps( vz1, mvp3Z, w1 ); 2034 2035 __m128 maxW0 = w0; 2036 __m128 maxW1 = w1; 2037 __m128 minW0 = _mm_mul_ps( w0, minMul ); 2038 __m128 minW1 = _mm_mul_ps( w1, minMul ); 2039 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 2040 __m128 minZ0 = vector_float_zero; 2041 __m128 minZ1 = vector_float_zero; 2042 #else 2043 __m128 minZ0 = minW0; 2044 __m128 minZ1 = minW1; 2045 #endif 2046 2047 cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x0, minW0 ) ); 2048 cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW0, x0 ) ); 2049 cullBits2 = _mm_or_ps( cullBits2, 
_mm_cmpgt_ps( y0, minW0 ) ); 2050 cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW0, y0 ) ); 2051 cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z0, minZ0 ) ); // NOTE: using minZ0 2052 cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW0, z0 ) ); 2053 2054 cullBits0 = _mm_or_ps( cullBits0, _mm_cmpgt_ps( x1, minW1 ) ); 2055 cullBits1 = _mm_or_ps( cullBits1, _mm_cmpgt_ps( maxW1, x1 ) ); 2056 cullBits2 = _mm_or_ps( cullBits2, _mm_cmpgt_ps( y1, minW1 ) ); 2057 cullBits3 = _mm_or_ps( cullBits3, _mm_cmpgt_ps( maxW1, y1 ) ); 2058 cullBits4 = _mm_or_ps( cullBits4, _mm_cmpgt_ps( z1, minZ1 ) ); // NOTE: using minZ1 2059 cullBits5 = _mm_or_ps( cullBits5, _mm_cmpgt_ps( maxW1, z1 ) ); 2060 } 2061 2062 cullBits0 = _mm_and_ps( cullBits0, vector_float_mask0 ); 2063 cullBits1 = _mm_and_ps( cullBits1, vector_float_mask1 ); 2064 cullBits2 = _mm_and_ps( cullBits2, vector_float_mask2 ); 2065 cullBits3 = _mm_and_ps( cullBits3, vector_float_mask3 ); 2066 cullBits4 = _mm_and_ps( cullBits4, vector_float_mask4 ); 2067 cullBits5 = _mm_and_ps( cullBits5, vector_float_mask5 ); 2068 2069 cullBits0 = _mm_or_ps( cullBits0, cullBits1 ); 2070 cullBits2 = _mm_or_ps( cullBits2, cullBits3 ); 2071 cullBits4 = _mm_or_ps( cullBits4, cullBits5 ); 2072 cullBits0 = _mm_or_ps( cullBits0, cullBits2 ); 2073 cullBits0 = _mm_or_ps( cullBits0, cullBits4 ); 2074 2075 cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2076 cullBits0 = _mm_or_ps( cullBits0, _mm_perm_ps( cullBits0, _MM_SHUFFLE( 0, 1, 0, 1 ) ) ); 2077 2078 int bits = _mm_cvtsi128_si32( (const __m128i &)cullBits0 ); 2079 2080 *outBits = (byte)(bits ^ 63); 2081 2082 return ( bits != 63 ); 2083 2084 #else 2085 2086 int bits = 0; 2087 2088 float closing = extrudeDirection * clipPlane.Normal(); 2089 float invClosing = -1.0f / closing; 2090 2091 idVec3 v; 2092 for ( int x = 0; x < 2; x++ ) { 2093 v[0] = bounds[x][0]; 2094 for ( int y = 0; y < 2; y++ ) { 2095 v[1] = bounds[y][1]; 2096 for ( int z = 0; z < 2; z++ ) 
{ 2097 v[2] = bounds[z][2]; 2098 2099 for ( int extrude = 0; extrude <= 1; extrude++ ) { 2100 2101 idVec3 test; 2102 if ( extrude ) { 2103 const float extrudeDist = clipPlane.Distance( v ) * invClosing; 2104 test = v + extrudeDirection * extrudeDist; 2105 } else { 2106 test = v; 2107 } 2108 2109 idVec4 c; 2110 for ( int i = 0; i < 4; i++ ) { 2111 c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3]; 2112 } 2113 2114 const float minW = zeroToOne ? 0.0f : -c[3]; 2115 const float maxW = c[3]; 2116 #if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false 2117 const float minZ = 0.0f; 2118 #else 2119 const float minZ = minW; 2120 #endif 2121 2122 if ( c[0] > minW ) { bits |= ( 1 << 0 ); } 2123 if ( c[0] < maxW ) { bits |= ( 1 << 1 ); } 2124 if ( c[1] > minW ) { bits |= ( 1 << 2 ); } 2125 if ( c[1] < maxW ) { bits |= ( 1 << 3 ); } 2126 if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ 2127 if ( c[2] < maxW ) { bits |= ( 1 << 5 ); } 2128 } 2129 } 2130 } 2131 } 2132 2133 // store out a bit set for each side where the bounds is outside the clip space 2134 *outBits = (byte)(bits ^ 63); 2135 2136 // if any bits weren't set, the bounds is completely off one side of the frustum 2137 return ( bits != 63 ); 2138 2139 #endif 2140 } 2141 2142 /* 2143 ======================== 2144 idRenderMatrix::ProjectedBounds 2145 2146 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix. 2147 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range. 2148 2149 The given bounding box is not clipped to the MVP so the projected bounds may not be as tight as possible. 2150 If the given bounding box is W=0 clipped then the projected bounds will cover the full X-Y range. 
2151 Note that while projected[0][1] will be set to the minimum when the given bounding box is W=0 clipped, 2152 projected[1][1] will still be valid and will NOT be set to the maximum when the given bounding box 2153 is W=0 clipped. 2154 ======================== 2155 */ 2156 void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { 2157 #ifdef ID_WIN_X86_SSE2_INTRIN 2158 2159 __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 2160 __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 2161 __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 2162 __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 2163 2164 __m128 b0 = _mm_loadu_bounds_0( bounds ); 2165 __m128 b1 = _mm_loadu_bounds_1( bounds ); 2166 2167 // take the four points on the X-Y plane 2168 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 2169 __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 2170 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 2171 2172 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 2173 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 2174 2175 // compute four partial X,Y,Z,W values 2176 __m128 parx = _mm_splat_ps( mvp0, 3 ); 2177 __m128 pary = _mm_splat_ps( mvp1, 3 ); 2178 __m128 parz = _mm_splat_ps( mvp2, 3 ); 2179 __m128 parw = _mm_splat_ps( mvp3, 3 ); 2180 2181 parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx ); 2182 pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary ); 2183 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 2184 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 2185 2186 parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx ); 2187 pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary ); 2188 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 2189 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 2190 2191 // compute full X,Y,Z,W values 2192 __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 
2193 __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 2194 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 2195 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 2196 2197 __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx ); 2198 __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary ); 2199 __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz ); 2200 __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw ); 2201 2202 __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx ); 2203 __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary ); 2204 __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz ); 2205 __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw ); 2206 2207 __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 ); 2208 __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 ); 2209 2210 w0 = _mm_sel_ps( w0, vector_float_one, s0 ); 2211 w1 = _mm_sel_ps( w1, vector_float_one, s1 ); 2212 2213 __m128 rw0 = _mm_rcp32_ps( w0 ); 2214 __m128 rw1 = _mm_rcp32_ps( w1 ); 2215 2216 x0 = _mm_mul_ps( x0, rw0 ); 2217 y0 = _mm_mul_ps( y0, rw0 ); 2218 z0 = _mm_mul_ps( z0, rw0 ); 2219 2220 x1 = _mm_mul_ps( x1, rw1 ); 2221 y1 = _mm_mul_ps( y1, rw1 ); 2222 z1 = _mm_mul_ps( z1, rw1 ); 2223 2224 __m128 minX = _mm_min_ps( x0, x1 ); 2225 __m128 minY = _mm_min_ps( y0, y1 ); 2226 __m128 minZ = _mm_min_ps( z0, z1 ); 2227 2228 __m128 maxX = _mm_max_ps( x0, x1 ); 2229 __m128 maxY = _mm_max_ps( y0, y1 ); 2230 __m128 maxZ = _mm_max_ps( z0, z1 ); 2231 2232 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2233 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2234 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2235 2236 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2237 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2238 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2239 2240 maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2241 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 
2242 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2243 2244 maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2245 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2246 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2247 2248 s0 = _mm_or_ps( s0, s1 ); 2249 s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2250 s0 = _mm_or_ps( s0, _mm_perm_ps( s0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2251 2252 minX = _mm_sel_ps( minX, vector_float_neg_infinity, s0 ); 2253 minY = _mm_sel_ps( minY, vector_float_neg_infinity, s0 ); 2254 minZ = _mm_sel_ps( minZ, vector_float_neg_infinity, s0 ); 2255 2256 maxX = _mm_sel_ps( maxX, vector_float_pos_infinity, s0 ); 2257 maxY = _mm_sel_ps( maxY, vector_float_pos_infinity, s0 ); 2258 // NOTE: maxZ is valid either way 2259 2260 if ( windowSpace ) { 2261 minX = _mm_madd_ps( minX, vector_float_half, vector_float_half ); 2262 maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half ); 2263 2264 minY = _mm_madd_ps( minY, vector_float_half, vector_float_half ); 2265 maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half ); 2266 2267 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 2268 minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half ); 2269 maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half ); 2270 #endif 2271 2272 minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero ); 2273 maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero ); 2274 2275 minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero ); 2276 maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero ); 2277 2278 minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero ); 2279 maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero ); 2280 } 2281 2282 _mm_store_ss( & 
projected[0].x, minX ); 2283 _mm_store_ss( & projected[0].y, minY ); 2284 _mm_store_ss( & projected[0].z, minZ ); 2285 2286 _mm_store_ss( & projected[1].x, maxX ); 2287 _mm_store_ss( & projected[1].y, maxY ); 2288 _mm_store_ss( & projected[1].z, maxZ ); 2289 2290 #else 2291 2292 for ( int i = 0; i < 3; i++ ) { 2293 projected[0][i] = RENDER_MATRIX_INFINITY; 2294 projected[1][i] = - RENDER_MATRIX_INFINITY; 2295 } 2296 2297 idVec3 v; 2298 for ( int x = 0; x < 2; x++ ) { 2299 v[0] = bounds[x][0]; 2300 for ( int y = 0; y < 2; y++ ) { 2301 v[1] = bounds[y][1]; 2302 for ( int z = 0; z < 2; z++ ) { 2303 v[2] = bounds[z][2]; 2304 2305 float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; 2306 float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; 2307 float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 2308 float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 2309 2310 if ( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) { 2311 projected[0][0] = -RENDER_MATRIX_INFINITY; 2312 projected[0][1] = -RENDER_MATRIX_INFINITY; 2313 projected[0][2] = -RENDER_MATRIX_INFINITY; 2314 projected[1][0] = RENDER_MATRIX_INFINITY; 2315 projected[1][1] = RENDER_MATRIX_INFINITY; 2316 // NOTE: projected[1][1] is still valid 2317 continue; 2318 } 2319 2320 float rw = 1.0f / tw; 2321 2322 tx = tx * rw; 2323 ty = ty * rw; 2324 tz = tz * rw; 2325 2326 projected[0][0] = Min( projected[0][0], tx ); 2327 projected[0][1] = Min( projected[0][1], ty ); 2328 projected[0][2] = Min( projected[0][2], tz ); 2329 2330 projected[1][0] = Max( projected[1][0], tx ); 2331 projected[1][1] = Max( projected[1][1], ty ); 2332 projected[1][2] = Max( projected[1][2], tz ); 2333 } 2334 } 2335 } 2336 2337 if ( windowSpace ) { 2338 // convert to window coords 2339 projected[0][0] = projected[0][0] * 0.5f + 0.5f; 2340 projected[1][0] = projected[1][0] * 0.5f + 0.5f; 2341 2342 projected[0][1] = projected[0][1] * 0.5f + 0.5f; 2343 
projected[1][1] = projected[1][1] * 0.5f + 0.5f; 2344 2345 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 2346 projected[0][2] = projected[0][2] * 0.5f + 0.5f; 2347 projected[1][2] = projected[1][2] * 0.5f + 0.5f; 2348 #endif 2349 2350 // clamp to [0, 1] range 2351 projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); 2352 projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); 2353 2354 projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); 2355 projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); 2356 2357 projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); 2358 projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); 2359 } 2360 2361 #endif 2362 } 2363 2364 /* 2365 ======================== 2366 idRenderMatrix::ProjectedNearClippedBounds 2367 2368 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix. 2369 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range. 2370 2371 The given bounding box is first near clipped so the projected bounds do not cover the full X-Y range when 2372 the given bounding box crosses the W=0 plane. However, the given bounding box is not clipped against the 2373 other planes so the projected bounds are still not as tight as they could be if the given bounding box 2374 crosses a corner. Fortunately, clipping to the near clipping planes typically provides more than 50% of 2375 the gain between not clipping at all and fully clipping the bounding box to all planes. Only clipping to 2376 the near clipping plane is much cheaper than clipping to all planes and can be easily implemented with 2377 completely branchless SIMD. 
2378 ======================== 2379 */ 2380 void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { 2381 /* 2382 4----{E}---5 2383 + /| /| 2384 Z {H} {I} {F} | 2385 - / | / {J} 2386 7--{G}-----6 | 2387 | | | | 2388 {L} 0----|-{A}-1 2389 | / {K} / - 2390 | {D} | {B} Y 2391 |/ |/ + 2392 3---{C}----2 2393 2394 - X + 2395 */ 2396 2397 #ifdef ID_WIN_X86_SSE2_INTRIN 2398 2399 const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 2400 const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 2401 const __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 2402 const __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 2403 2404 const __m128 b0 = _mm_loadu_bounds_0( bounds ); 2405 const __m128 b1 = _mm_loadu_bounds_1( bounds ); 2406 2407 // take the four points on the X-Y plane 2408 const __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 2409 const __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 2410 const __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 2411 2412 const __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 2413 const __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 2414 2415 // compute four partial X,Y,Z,W values 2416 __m128 parx = _mm_splat_ps( mvp0, 3 ); 2417 __m128 pary = _mm_splat_ps( mvp1, 3 ); 2418 __m128 parz = _mm_splat_ps( mvp2, 3 ); 2419 __m128 parw = _mm_splat_ps( mvp3, 3 ); 2420 2421 parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx ); 2422 pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary ); 2423 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 2424 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 2425 2426 parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx ); 2427 pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary ); 2428 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 2429 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 2430 2431 // 
compute full X,Y,Z,W values 2432 const __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 2433 const __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 2434 const __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 2435 const __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 2436 2437 const __m128 x_0123 = _mm_madd_ps( vz0, mvp0Z, parx ); 2438 const __m128 y_0123 = _mm_madd_ps( vz0, mvp1Z, pary ); 2439 const __m128 z_0123 = _mm_madd_ps( vz0, mvp2Z, parz ); 2440 const __m128 w_0123 = _mm_madd_ps( vz0, mvp3Z, parw ); 2441 2442 const __m128 x_4567 = _mm_madd_ps( vz1, mvp0Z, parx ); 2443 const __m128 y_4567 = _mm_madd_ps( vz1, mvp1Z, pary ); 2444 const __m128 z_4567 = _mm_madd_ps( vz1, mvp2Z, parz ); 2445 const __m128 w_4567 = _mm_madd_ps( vz1, mvp3Z, parw ); 2446 2447 // rotate the X,Y,Z,W values up by one 2448 const __m128 x_1230 = _mm_perm_ps( x_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2449 const __m128 y_1230 = _mm_perm_ps( y_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2450 const __m128 z_1230 = _mm_perm_ps( z_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2451 const __m128 w_1230 = _mm_perm_ps( w_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2452 2453 const __m128 x_5674 = _mm_perm_ps( x_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2454 const __m128 y_5674 = _mm_perm_ps( y_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2455 const __m128 z_5674 = _mm_perm_ps( z_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2456 const __m128 w_5674 = _mm_perm_ps( w_4567, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 2457 2458 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 2459 const __m128 d_0123 = z_0123; 2460 const __m128 d_4567 = z_4567; 2461 const __m128 d_1230 = z_1230; 2462 const __m128 d_5674 = z_5674; 2463 #else 2464 const __m128 d_0123 = _mm_add_ps( z_0123, w_0123 ); 2465 const __m128 d_4567 = _mm_add_ps( z_4567, w_4567 ); 2466 const __m128 d_1230 = _mm_add_ps( z_1230, w_1230 ); 2467 const __m128 d_5674 = _mm_add_ps( z_5674, w_5674 ); 2468 #endif 2469 2470 const __m128 deltaABCD = _mm_sub_ps( d_0123, d_1230 ); 2471 const __m128 deltaEFGH = _mm_sub_ps( d_4567, d_5674 ); 
2472 const __m128 deltaIJKL = _mm_sub_ps( d_0123, d_4567 ); 2473 2474 const __m128 maskABCD = _mm_cmpgt_ps( _mm_and_ps( deltaABCD, vector_float_abs_mask ), vector_float_smallest_non_denorm ); 2475 const __m128 maskEFGH = _mm_cmpgt_ps( _mm_and_ps( deltaEFGH, vector_float_abs_mask ), vector_float_smallest_non_denorm ); 2476 const __m128 maskIJKL = _mm_cmpgt_ps( _mm_and_ps( deltaIJKL, vector_float_abs_mask ), vector_float_smallest_non_denorm ); 2477 2478 const __m128 fractionABCD = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaABCD, maskABCD ) ), maskABCD ); 2479 const __m128 fractionEFGH = _mm_and_ps( _mm_div32_ps( d_4567, _mm_sel_ps( vector_float_one, deltaEFGH, maskEFGH ) ), maskEFGH ); 2480 const __m128 fractionIJKL = _mm_and_ps( _mm_div32_ps( d_0123, _mm_sel_ps( vector_float_one, deltaIJKL, maskIJKL ) ), maskIJKL ); 2481 2482 const __m128 clipABCD = _mm_and_ps( _mm_cmpgt_ps( fractionABCD, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionABCD ) ); 2483 const __m128 clipEFGH = _mm_and_ps( _mm_cmpgt_ps( fractionEFGH, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionEFGH ) ); 2484 const __m128 clipIJKL = _mm_and_ps( _mm_cmpgt_ps( fractionIJKL, vector_float_zero ), _mm_cmpgt_ps( vector_float_one, fractionIJKL ) ); 2485 2486 const __m128 intersectionABCD_x = _mm_madd_ps( fractionABCD, _mm_sub_ps( x_1230, x_0123 ), x_0123 ); 2487 const __m128 intersectionABCD_y = _mm_madd_ps( fractionABCD, _mm_sub_ps( y_1230, y_0123 ), y_0123 ); 2488 const __m128 intersectionABCD_z = _mm_madd_ps( fractionABCD, _mm_sub_ps( z_1230, z_0123 ), z_0123 ); 2489 const __m128 intersectionABCD_w = _mm_madd_ps( fractionABCD, _mm_sub_ps( w_1230, w_0123 ), w_0123 ); 2490 2491 const __m128 intersectionEFGH_x = _mm_madd_ps( fractionEFGH, _mm_sub_ps( x_5674, x_4567 ), x_4567 ); 2492 const __m128 intersectionEFGH_y = _mm_madd_ps( fractionEFGH, _mm_sub_ps( y_5674, y_4567 ), y_4567 ); 2493 const __m128 intersectionEFGH_z = _mm_madd_ps( fractionEFGH, 
_mm_sub_ps( z_5674, z_4567 ), z_4567 ); 2494 const __m128 intersectionEFGH_w = _mm_madd_ps( fractionEFGH, _mm_sub_ps( w_5674, w_4567 ), w_4567 ); 2495 2496 const __m128 intersectionIJKL_x = _mm_madd_ps( fractionIJKL, _mm_sub_ps( x_4567, x_0123 ), x_0123 ); 2497 const __m128 intersectionIJKL_y = _mm_madd_ps( fractionIJKL, _mm_sub_ps( y_4567, y_0123 ), y_0123 ); 2498 const __m128 intersectionIJKL_z = _mm_madd_ps( fractionIJKL, _mm_sub_ps( z_4567, z_0123 ), z_0123 ); 2499 const __m128 intersectionIJKL_w = _mm_madd_ps( fractionIJKL, _mm_sub_ps( w_4567, w_0123 ), w_0123 ); 2500 2501 const __m128 mask_0123 = _mm_cmpgt_ps( vector_float_zero, d_0123 ); 2502 const __m128 mask_1230 = _mm_cmpgt_ps( vector_float_zero, d_1230 ); 2503 const __m128 mask_4567 = _mm_cmpgt_ps( vector_float_zero, d_4567 ); 2504 const __m128 mask_5674 = _mm_cmpgt_ps( vector_float_zero, d_5674 ); 2505 2506 const __m128 maskABCD_0123 = _mm_and_ps( clipABCD, mask_0123 ); 2507 const __m128 maskABCD_1230 = _mm_and_ps( clipABCD, mask_1230 ); 2508 const __m128 maskEFGH_4567 = _mm_and_ps( clipEFGH, mask_4567 ); 2509 const __m128 maskEFGH_5674 = _mm_and_ps( clipEFGH, mask_5674 ); 2510 const __m128 maskIJKL_0123 = _mm_and_ps( clipIJKL, mask_0123 ); 2511 const __m128 maskIJKL_4567 = _mm_and_ps( clipIJKL, mask_4567 ); 2512 2513 __m128 edgeVertsABCD_x0 = _mm_sel_ps( x_0123, intersectionABCD_x, maskABCD_0123 ); 2514 __m128 edgeVertsABCD_y0 = _mm_sel_ps( y_0123, intersectionABCD_y, maskABCD_0123 ); 2515 __m128 edgeVertsABCD_z0 = _mm_sel_ps( z_0123, intersectionABCD_z, maskABCD_0123 ); 2516 __m128 edgeVertsABCD_w0 = _mm_sel_ps( w_0123, intersectionABCD_w, maskABCD_0123 ); 2517 2518 __m128 edgeVertsABCD_x1 = _mm_sel_ps( x_1230, intersectionABCD_x, maskABCD_1230 ); 2519 __m128 edgeVertsABCD_y1 = _mm_sel_ps( y_1230, intersectionABCD_y, maskABCD_1230 ); 2520 __m128 edgeVertsABCD_z1 = _mm_sel_ps( z_1230, intersectionABCD_z, maskABCD_1230 ); 2521 __m128 edgeVertsABCD_w1 = _mm_sel_ps( w_1230, intersectionABCD_w, 
maskABCD_1230 ); 2522 2523 __m128 edgeVertsEFGH_x0 = _mm_sel_ps( x_4567, intersectionEFGH_x, maskEFGH_4567 ); 2524 __m128 edgeVertsEFGH_y0 = _mm_sel_ps( y_4567, intersectionEFGH_y, maskEFGH_4567 ); 2525 __m128 edgeVertsEFGH_z0 = _mm_sel_ps( z_4567, intersectionEFGH_z, maskEFGH_4567 ); 2526 __m128 edgeVertsEFGH_w0 = _mm_sel_ps( w_4567, intersectionEFGH_w, maskEFGH_4567 ); 2527 2528 __m128 edgeVertsEFGH_x1 = _mm_sel_ps( x_5674, intersectionEFGH_x, maskEFGH_5674 ); 2529 __m128 edgeVertsEFGH_y1 = _mm_sel_ps( y_5674, intersectionEFGH_y, maskEFGH_5674 ); 2530 __m128 edgeVertsEFGH_z1 = _mm_sel_ps( z_5674, intersectionEFGH_z, maskEFGH_5674 ); 2531 __m128 edgeVertsEFGH_w1 = _mm_sel_ps( w_5674, intersectionEFGH_w, maskEFGH_5674 ); 2532 2533 __m128 edgeVertsIJKL_x0 = _mm_sel_ps( x_0123, intersectionIJKL_x, maskIJKL_0123 ); 2534 __m128 edgeVertsIJKL_y0 = _mm_sel_ps( y_0123, intersectionIJKL_y, maskIJKL_0123 ); 2535 __m128 edgeVertsIJKL_z0 = _mm_sel_ps( z_0123, intersectionIJKL_z, maskIJKL_0123 ); 2536 __m128 edgeVertsIJKL_w0 = _mm_sel_ps( w_0123, intersectionIJKL_w, maskIJKL_0123 ); 2537 2538 __m128 edgeVertsIJKL_x1 = _mm_sel_ps( x_4567, intersectionIJKL_x, maskIJKL_4567 ); 2539 __m128 edgeVertsIJKL_y1 = _mm_sel_ps( y_4567, intersectionIJKL_y, maskIJKL_4567 ); 2540 __m128 edgeVertsIJKL_z1 = _mm_sel_ps( z_4567, intersectionIJKL_z, maskIJKL_4567 ); 2541 __m128 edgeVertsIJKL_w1 = _mm_sel_ps( w_4567, intersectionIJKL_w, maskIJKL_4567 ); 2542 2543 const __m128 maskABCD_w0 = _mm_cmpgt_ps( edgeVertsABCD_w0, vector_float_smallest_non_denorm ); 2544 const __m128 maskABCD_w1 = _mm_cmpgt_ps( edgeVertsABCD_w1, vector_float_smallest_non_denorm ); 2545 const __m128 maskEFGH_w0 = _mm_cmpgt_ps( edgeVertsEFGH_w0, vector_float_smallest_non_denorm ); 2546 const __m128 maskEFGH_w1 = _mm_cmpgt_ps( edgeVertsEFGH_w1, vector_float_smallest_non_denorm ); 2547 const __m128 maskIJKL_w0 = _mm_cmpgt_ps( edgeVertsIJKL_w0, vector_float_smallest_non_denorm ); 2548 const __m128 maskIJKL_w1 = _mm_cmpgt_ps( 
edgeVertsIJKL_w1, vector_float_smallest_non_denorm ); 2549 2550 edgeVertsABCD_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w0, maskABCD_w0 ) ); 2551 edgeVertsABCD_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsABCD_w1, maskABCD_w1 ) ); 2552 edgeVertsEFGH_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w0, maskEFGH_w0 ) ); 2553 edgeVertsEFGH_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsEFGH_w1, maskEFGH_w1 ) ); 2554 edgeVertsIJKL_w0 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w0, maskIJKL_w0 ) ); 2555 edgeVertsIJKL_w1 = _mm_rcp32_ps( _mm_sel_ps( vector_float_one, edgeVertsIJKL_w1, maskIJKL_w1 ) ); 2556 2557 edgeVertsABCD_x0 = _mm_mul_ps( edgeVertsABCD_x0, edgeVertsABCD_w0 ); 2558 edgeVertsABCD_x1 = _mm_mul_ps( edgeVertsABCD_x1, edgeVertsABCD_w1 ); 2559 edgeVertsEFGH_x0 = _mm_mul_ps( edgeVertsEFGH_x0, edgeVertsEFGH_w0 ); 2560 edgeVertsEFGH_x1 = _mm_mul_ps( edgeVertsEFGH_x1, edgeVertsEFGH_w1 ); 2561 edgeVertsIJKL_x0 = _mm_mul_ps( edgeVertsIJKL_x0, edgeVertsIJKL_w0 ); 2562 edgeVertsIJKL_x1 = _mm_mul_ps( edgeVertsIJKL_x1, edgeVertsIJKL_w1 ); 2563 2564 edgeVertsABCD_y0 = _mm_mul_ps( edgeVertsABCD_y0, edgeVertsABCD_w0 ); 2565 edgeVertsABCD_y1 = _mm_mul_ps( edgeVertsABCD_y1, edgeVertsABCD_w1 ); 2566 edgeVertsEFGH_y0 = _mm_mul_ps( edgeVertsEFGH_y0, edgeVertsEFGH_w0 ); 2567 edgeVertsEFGH_y1 = _mm_mul_ps( edgeVertsEFGH_y1, edgeVertsEFGH_w1 ); 2568 edgeVertsIJKL_y0 = _mm_mul_ps( edgeVertsIJKL_y0, edgeVertsIJKL_w0 ); 2569 edgeVertsIJKL_y1 = _mm_mul_ps( edgeVertsIJKL_y1, edgeVertsIJKL_w1 ); 2570 2571 edgeVertsABCD_z0 = _mm_mul_ps( edgeVertsABCD_z0, edgeVertsABCD_w0 ); 2572 edgeVertsABCD_z1 = _mm_mul_ps( edgeVertsABCD_z1, edgeVertsABCD_w1 ); 2573 edgeVertsEFGH_z0 = _mm_mul_ps( edgeVertsEFGH_z0, edgeVertsEFGH_w0 ); 2574 edgeVertsEFGH_z1 = _mm_mul_ps( edgeVertsEFGH_z1, edgeVertsEFGH_w1 ); 2575 edgeVertsIJKL_z0 = _mm_mul_ps( edgeVertsIJKL_z0, edgeVertsIJKL_w0 ); 2576 edgeVertsIJKL_z1 = _mm_mul_ps( 
edgeVertsIJKL_z1, edgeVertsIJKL_w1 ); 2577 2578 const __m128 posInf = vector_float_pos_infinity; 2579 const __m128 negInf = vector_float_neg_infinity; 2580 2581 const __m128 minX0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_x1, maskABCD_w1 ) ); 2582 const __m128 minX1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_x1, maskEFGH_w1 ) ); 2583 const __m128 minX2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_x1, maskIJKL_w1 ) ); 2584 2585 const __m128 minY0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_y1, maskABCD_w1 ) ); 2586 const __m128 minY1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_y1, maskEFGH_w1 ) ); 2587 const __m128 minY2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_y1, maskIJKL_w1 ) ); 2588 2589 const __m128 minZ0 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( posInf, edgeVertsABCD_z1, maskABCD_w1 ) ); 2590 const __m128 minZ1 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( posInf, edgeVertsEFGH_z1, maskEFGH_w1 ) ); 2591 const __m128 minZ2 = _mm_min_ps( _mm_sel_ps( posInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( posInf, edgeVertsIJKL_z1, maskIJKL_w1 ) ); 2592 2593 const __m128 maxX0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_x0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_x1, maskABCD_w1 ) ); 2594 const __m128 maxX1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_x0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_x1, maskEFGH_w1 ) ); 2595 const __m128 maxX2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_x0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_x1, maskIJKL_w1 ) ); 2596 2597 const __m128 maxY0 = _mm_max_ps( _mm_sel_ps( negInf, 
edgeVertsABCD_y0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_y1, maskABCD_w1 ) ); 2598 const __m128 maxY1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_y0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_y1, maskEFGH_w1 ) ); 2599 const __m128 maxY2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_y0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_y1, maskIJKL_w1 ) ); 2600 2601 const __m128 maxZ0 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsABCD_z0, maskABCD_w0 ), _mm_sel_ps( negInf, edgeVertsABCD_z1, maskABCD_w1 ) ); 2602 const __m128 maxZ1 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsEFGH_z0, maskEFGH_w0 ), _mm_sel_ps( negInf, edgeVertsEFGH_z1, maskEFGH_w1 ) ); 2603 const __m128 maxZ2 = _mm_max_ps( _mm_sel_ps( negInf, edgeVertsIJKL_z0, maskIJKL_w0 ), _mm_sel_ps( negInf, edgeVertsIJKL_z1, maskIJKL_w1 ) ); 2604 2605 __m128 minX = _mm_min_ps( minX0, _mm_min_ps( minX1, minX2 ) ); 2606 __m128 minY = _mm_min_ps( minY0, _mm_min_ps( minY1, minY2 ) ); 2607 __m128 minZ = _mm_min_ps( minZ0, _mm_min_ps( minZ1, minZ2 ) ); 2608 2609 __m128 maxX = _mm_max_ps( maxX0, _mm_max_ps( maxX1, maxX2 ) ); 2610 __m128 maxY = _mm_max_ps( maxY0, _mm_max_ps( maxY1, maxY2 ) ); 2611 __m128 maxZ = _mm_max_ps( maxZ0, _mm_max_ps( maxZ1, maxZ2 ) ); 2612 2613 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2614 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2615 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2616 2617 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2618 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2619 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2620 2621 maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2622 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2623 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 2624 2625 maxX = 
_mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2626 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2627 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 2628 2629 if ( windowSpace ) { 2630 minX = _mm_madd_ps( minX, vector_float_half, vector_float_half ); 2631 maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half ); 2632 2633 minY = _mm_madd_ps( minY, vector_float_half, vector_float_half ); 2634 maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half ); 2635 2636 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 2637 minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half ); 2638 maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half ); 2639 #endif 2640 2641 minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero ); 2642 maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero ); 2643 2644 minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero ); 2645 maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero ); 2646 2647 minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero ); 2648 maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero ); 2649 } 2650 2651 _mm_store_ss( & projected[0].x, minX ); 2652 _mm_store_ss( & projected[0].y, minY ); 2653 _mm_store_ss( & projected[0].z, minZ ); 2654 2655 _mm_store_ss( & projected[1].x, maxX ); 2656 _mm_store_ss( & projected[1].y, maxY ); 2657 _mm_store_ss( & projected[1].z, maxZ ); 2658 2659 #elif 1 2660 2661 { 2662 const idVec3 points[8] = { 2663 idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), 2664 idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), 2665 idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), 2666 idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), 2667 idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), 2668 idVec3( bounds[1][0], bounds[0][1], bounds[1][2] 
), 2669 idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), 2670 idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) 2671 }; 2672 2673 idVec4 projectedPoints[8]; 2674 for ( int i = 0; i < 8; i++ ) { 2675 const idVec3 & v = points[i]; 2676 projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; 2677 projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; 2678 projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 2679 projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 2680 } 2681 2682 const idVec4 & p0 = projectedPoints[0]; 2683 const idVec4 & p1 = projectedPoints[1]; 2684 const idVec4 & p2 = projectedPoints[2]; 2685 const idVec4 & p3 = projectedPoints[3]; 2686 const idVec4 & p4 = projectedPoints[4]; 2687 const idVec4 & p5 = projectedPoints[5]; 2688 const idVec4 & p6 = projectedPoints[6]; 2689 const idVec4 & p7 = projectedPoints[7]; 2690 2691 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 2692 const float d0 = p0.z; 2693 const float d1 = p1.z; 2694 const float d2 = p2.z; 2695 const float d3 = p3.z; 2696 const float d4 = p4.z; 2697 const float d5 = p5.z; 2698 const float d6 = p6.z; 2699 const float d7 = p7.z; 2700 #else 2701 const float d0 = p0.z + p0.w; 2702 const float d1 = p1.z + p1.w; 2703 const float d2 = p2.z + p2.w; 2704 const float d3 = p3.z + p3.w; 2705 const float d4 = p4.z + p4.w; 2706 const float d5 = p5.z + p5.w; 2707 const float d6 = p6.z + p6.w; 2708 const float d7 = p7.z + p7.w; 2709 #endif 2710 2711 const float deltaA = d0 - d1; 2712 const float deltaB = d1 - d2; 2713 const float deltaC = d2 - d3; 2714 const float deltaD = d3 - d0; 2715 2716 const float deltaE = d4 - d5; 2717 const float deltaF = d5 - d6; 2718 const float deltaG = d6 - d7; 2719 const float deltaH = d7 - d4; 2720 2721 const float deltaI = d0 - d4; 2722 const float deltaJ = d1 - d5; 2723 const float deltaK = d2 - 
d6; 2724 const float deltaL = d3 - d7; 2725 2726 const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f; 2727 const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f; 2728 const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f; 2729 const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f; 2730 2731 const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d4 / deltaE ) : 0.0f; 2732 const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f; 2733 const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f; 2734 const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f; 2735 2736 const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f; 2737 const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f; 2738 const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f; 2739 const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? 
( d3 / deltaL ) : 0.0f; 2740 2741 const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f ); 2742 const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f ); 2743 const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f ); 2744 const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f ); 2745 2746 const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f ); 2747 const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f ); 2748 const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f ); 2749 const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f ); 2750 2751 const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f ); 2752 const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f ); 2753 const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f ); 2754 const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f ); 2755 2756 const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 ); 2757 const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 ); 2758 const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 ); 2759 const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 ); 2760 2761 const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 ); 2762 const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 ); 2763 const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 ); 2764 const idVec4 intersectionH = p7 + fractionH * ( p4 - p7 ); 2765 2766 const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 ); 2767 const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 ); 2768 const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 ); 2769 const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 ); 2770 2771 idVec4 edgeVerts[24]; 2772 2773 edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0; 2774 edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1; 2775 edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2; 2776 edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3; 2777 2778 edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? 
intersectionA : p1; 2779 edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2; 2780 edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? intersectionC : p3; 2781 edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0; 2782 2783 edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4; 2784 edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5; 2785 edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6; 2786 edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7; 2787 2788 edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5; 2789 edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6; 2790 edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7; 2791 edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4; 2792 2793 edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0; 2794 edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1; 2795 edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2; 2796 edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3; 2797 2798 edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4; 2799 edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5; 2800 edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6; 2801 edgeVerts[23] = ( clipL && d7 < 0.0f ) ? 
intersectionL : p7; 2802 2803 idBounds projBnds; 2804 for ( int i = 0; i < 3; i++ ) { 2805 projBnds[0][i] = RENDER_MATRIX_INFINITY; 2806 projBnds[1][i] = - RENDER_MATRIX_INFINITY; 2807 } 2808 2809 for ( int i = 0; i < 24; i++ ) { 2810 const idVec4 & v = edgeVerts[i]; 2811 2812 if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) { 2813 continue; 2814 } 2815 2816 const float rw = 1.0f / v.w; 2817 2818 const float px = v.x * rw; 2819 const float py = v.y * rw; 2820 const float pz = v.z * rw; 2821 2822 projBnds[0][0] = Min( projBnds[0][0], px ); 2823 projBnds[0][1] = Min( projBnds[0][1], py ); 2824 projBnds[0][2] = Min( projBnds[0][2], pz ); 2825 2826 projBnds[1][0] = Max( projBnds[1][0], px ); 2827 projBnds[1][1] = Max( projBnds[1][1], py ); 2828 projBnds[1][2] = Max( projBnds[1][2], pz ); 2829 } 2830 2831 if ( windowSpace ) { 2832 // convert to window coords 2833 projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f; 2834 projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f; 2835 2836 projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f; 2837 projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f; 2838 2839 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 2840 projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f; 2841 projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f; 2842 #endif 2843 2844 // clamp to [0, 1] range 2845 projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] ); 2846 projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] ); 2847 2848 projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] ); 2849 projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] ); 2850 2851 projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] ); 2852 projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] ); 2853 } 2854 2855 assert( projected[0].Compare( projBnds[0], 0.01f ) ); 2856 assert( projected[1].Compare( projBnds[1], 0.01f ) ); 2857 } 2858 2859 #else 2860 2861 const idVec3 points[8] = { 2862 idVec3( bounds[0][0], bounds[0][1], 
bounds[0][2] ), 2863 idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), 2864 idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), 2865 idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), 2866 idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), 2867 idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), 2868 idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), 2869 idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) 2870 }; 2871 2872 idVec4 projectedPoints[8]; 2873 for ( int i = 0; i < 8; i++ ) { 2874 const idVec3 & v = points[i]; 2875 projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; 2876 projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; 2877 projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 2878 projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 2879 } 2880 2881 idVec4 edgeVerts[24]; 2882 for ( int i = 0; i < 3; i++ ) { 2883 int offset0 = ( i & 1 ) * 4; 2884 int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2; 2885 int offset3 = ~( i >> 1 ) & 1; 2886 for ( int j = 0; j < 4; j++ ) { 2887 const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )]; 2888 const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )]; 2889 2890 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 2891 const float d0 = p0.z; 2892 const float d1 = p1.z; 2893 #else 2894 const float d0 = p0.z + p0.w; 2895 const float d1 = p1.z + p1.w; 2896 #endif 2897 const float delta = d0 - d1; 2898 const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f; 2899 const bool clip = ( fraction > 0.0f && fraction < 1.0f ); 2900 const idVec4 intersection = p0 + fraction * ( p1 - p0 ); 2901 2902 edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0; 2903 edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? 
intersection : p1; 2904 } 2905 } 2906 2907 for ( int i = 0; i < 3; i++ ) { 2908 projected[0][i] = RENDER_MATRIX_INFINITY; 2909 projected[1][i] = - RENDER_MATRIX_INFINITY; 2910 } 2911 2912 for ( int i = 0; i < 24; i++ ) { 2913 const idVec4 & v = edgeVerts[i]; 2914 2915 if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) { 2916 continue; 2917 } 2918 2919 const float rw = 1.0f / v.w; 2920 2921 const float px = v.x * rw; 2922 const float py = v.y * rw; 2923 const float pz = v.z * rw; 2924 2925 projected[0][0] = Min( projected[0][0], px ); 2926 projected[0][1] = Min( projected[0][1], py ); 2927 projected[0][2] = Min( projected[0][2], pz ); 2928 2929 projected[1][0] = Max( projected[1][0], px ); 2930 projected[1][1] = Max( projected[1][1], py ); 2931 projected[1][2] = Max( projected[1][2], pz ); 2932 } 2933 2934 if ( windowSpace ) { 2935 // convert to window coords 2936 projected[0][0] = projected[0][0] * 0.5f + 0.5f; 2937 projected[1][0] = projected[1][0] * 0.5f + 0.5f; 2938 2939 projected[0][1] = projected[0][1] * 0.5f + 0.5f; 2940 projected[1][1] = projected[1][1] * 0.5f + 0.5f; 2941 2942 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 2943 projected[0][2] = projected[0][2] * 0.5f + 0.5f; 2944 projected[1][2] = projected[1][2] * 0.5f + 0.5f; 2945 #endif 2946 2947 // clamp to [0, 1] range 2948 projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); 2949 projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); 2950 2951 projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); 2952 projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); 2953 2954 projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); 2955 projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); 2956 } 2957 2958 #endif 2959 } 2960 2961 #if 0 2962 2963 /* 2964 ======================== 2965 LocalViewOriginFromMVP 2966 ======================== 2967 */ 2968 static idVec3 LocalViewOriginFromMVP( const 
idRenderMatrix & mvp ) { 2969 const float nearX = mvp[3][0] + mvp[2][0]; 2970 const float nearY = mvp[3][1] + mvp[2][1]; 2971 const float nearZ = mvp[3][2] + mvp[2][2]; 2972 const float s = idMath::InvSqrt( nearX * nearX + nearY * nearY + nearZ * nearZ ); 2973 2974 idRenderMatrix inverseMVP; 2975 idRenderMatrix::Inverse( mvp, inverseMVP ); 2976 const float invW = 1.0f / inverseMVP[3][3]; 2977 const float x = ( inverseMVP[0][3] - nearX * s ) * invW; 2978 const float y = ( inverseMVP[1][3] - nearY * s ) * invW; 2979 const float z = ( inverseMVP[2][3] - nearZ * s ) * invW; 2980 2981 return idVec3( x, y, z ); 2982 } 2983 2984 #endif 2985 2986 /* 2987 ======================== 2988 LocalNearClipCenterFromMVP 2989 2990 Based on whether the depth range is [0,1] or [-1,1], either transform (0,0,0) or (0,0,-1) with the inverse MVP. 2991 ======================== 2992 */ 2993 static idVec3 LocalNearClipCenterFromMVP( const idRenderMatrix & mvp ) { 2994 idRenderMatrix inverseMVP; 2995 idRenderMatrix::Inverse( mvp, inverseMVP ); 2996 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 2997 const float x = inverseMVP[0][3]; 2998 const float y = inverseMVP[1][3]; 2999 const float z = inverseMVP[2][3]; 3000 const float w = inverseMVP[3][3]; 3001 #else 3002 const float x = inverseMVP[0][3] - inverseMVP[0][2]; 3003 const float y = inverseMVP[1][3] - inverseMVP[1][2]; 3004 const float z = inverseMVP[2][3] - inverseMVP[2][2]; 3005 const float w = inverseMVP[3][3] - inverseMVP[3][2]; 3006 #endif 3007 const float invW = 1.0f / w; 3008 return idVec3( x * invW, y * invW, z * invW ); 3009 } 3010 3011 #ifdef ID_WIN_X86_SSE2_INTRIN 3012 3013 /* 3014 ======================== 3015 ClipHomogeneousPolygonToSide 3016 3017 Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset. 
3018 ======================== 3019 */ 3020 static void ClipHomogeneousPolygonToSide_SSE2( idVec4 * __restrict newPoints, idVec4 * __restrict points, int & numPoints, 3021 const int axis, const __m128 & sign, const __m128 & offset ) { 3022 assert( newPoints != points ); 3023 3024 const __m128 side = _mm_mul_ps( sign, offset ); 3025 __m128i mask = _mm_sub_epi32( vector_int_0123, _mm_shuffle_epi32( _mm_cvtsi32_si128( numPoints ), 0 ) ); 3026 __m128i index = _mm_setzero_si128(); 3027 3028 ALIGNTYPE16 unsigned short indices[16 * 2]; 3029 ALIGNTYPE16 float clipFractions[16]; 3030 3031 int localNumPoint = numPoints; 3032 3033 for ( int i = 0; i < localNumPoint; i += 4 ) { 3034 const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 ); 3035 const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 ); 3036 const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 ); 3037 const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 ); 3038 const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 ); 3039 3040 const __m128 p0A = _mm_load_ss( &points[i0][axis] ); 3041 const __m128 p1A = _mm_load_ss( &points[i1][axis] ); 3042 const __m128 p2A = _mm_load_ss( &points[i2][axis] ); 3043 const __m128 p3A = _mm_load_ss( &points[i3][axis] ); 3044 const __m128 p4A = _mm_load_ss( &points[i4][axis] ); 3045 3046 const __m128 p0W = _mm_load_ss( &points[i0][3] ); 3047 const __m128 p1W = _mm_load_ss( &points[i1][3] ); 3048 const __m128 p2W = _mm_load_ss( &points[i2][3] ); 3049 const __m128 p3W = _mm_load_ss( &points[i3][3] ); 3050 const __m128 p4W = _mm_load_ss( &points[i4][3] ); 3051 3052 const __m128 t0 = _mm_unpacklo_ps( p0A, p2A ); 3053 const __m128 t1 = _mm_unpacklo_ps( p1A, p3A ); 3054 const __m128 pa0 = _mm_unpacklo_ps( t0, t1 ); 3055 const __m128 pa1 = _mm_sld_ps( pa0, p4A, 4 ); 3056 3057 const __m128 r0 = _mm_unpacklo_ps( p0W, p2W ); 3058 const __m128 r1 = _mm_unpacklo_ps( p1W, p3W ); 3059 const __m128 pw0 = _mm_unpacklo_ps( r0, r1 ); 3060 const __m128 pw1 = 
_mm_sld_ps( pw0, p4W, 4 ); 3061 3062 { 3063 const __m128 bside0 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw0 ), _mm_mul_ps( sign, pa0 ) ); 3064 const __m128 bside1 = _mm_cmpgt_ps( _mm_mul_ps( offset, pw1 ), _mm_mul_ps( sign, pa1 ) ); 3065 const __m128i side0 = _mm_and_si128( __m128c( bside0 ), vector_int_1 ); 3066 const __m128i side1 = _mm_and_si128( __m128c( bside1 ), vector_int_1 ); 3067 const __m128i xorSide = _mm_xor_si128( side0, side1 ); 3068 const __m128i interleavedSide0 = _mm_unpacklo_epi32( side0, xorSide ); 3069 const __m128i interleavedSide1 = _mm_unpackhi_epi32( side0, xorSide ); 3070 const __m128i packedSide = _mm_packs_epi32( interleavedSide0, interleavedSide1 ); 3071 const __m128i packedMaskedSide = _mm_and_si128( packedSide, _mm_srai_epi32( mask, 31 ) ); 3072 3073 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 2 ) ); 3074 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 4 ) ); 3075 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 6 ) ); 3076 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 8 ) ); 3077 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 10 ) ); 3078 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 12 ) ); 3079 index = _mm_add_epi16( index, _mm_slli_si128( packedMaskedSide, 14 ) ); 3080 3081 _mm_store_si128( (__m128i *)&indices[i * 2], index ); 3082 3083 mask = _mm_add_epi32( mask, vector_int_4 ); 3084 index = _mm_add_epi16( index, packedMaskedSide ); 3085 index = _mm_shufflehi_epi16( index, _MM_SHUFFLE( 3, 3, 3, 3 ) ); 3086 index = _mm_shuffle_epi32( index, _MM_SHUFFLE( 3, 3, 3, 3 ) ); 3087 } 3088 3089 { 3090 const __m128 d0 = _mm_nmsub_ps( pw0, side, pa0 ); 3091 const __m128 d1 = _mm_nmsub_ps( pw1, side, pa1 ); 3092 const __m128 delta = _mm_sub_ps( d0, d1 ); 3093 const __m128 deltaAbs = _mm_and_ps( delta, vector_float_abs_mask ); 3094 const __m128 clamp = _mm_cmpgt_ps( vector_float_smallest_non_denorm, deltaAbs ); 3095 const __m128 deltaClamped = 
_mm_sel_ps( delta, vector_float_one, clamp ); 3096 const __m128 fraction = _mm_mul_ps( d0, _mm_rcp32_ps( deltaClamped ) ); 3097 const __m128 fractionClamped0 = _mm_sel_ps( fraction, vector_float_one, clamp ); 3098 const __m128 fractionClamped1 = _mm_max_ps( fractionClamped0, vector_float_zero ); 3099 const __m128 fractionClamped2 = _mm_min_ps( fractionClamped1, vector_float_one ); 3100 3101 _mm_store_ps( &clipFractions[i], fractionClamped2 ); 3102 } 3103 } 3104 3105 numPoints = _mm_cvtsi128_si32( index ) & 0xFFFF; 3106 3107 for ( int i = 0; i < localNumPoint; i += 4 ) { 3108 const int i0 = ( i + 0 ) & ( ( i + 0 - localNumPoint ) >> 31 ); 3109 const int i1 = ( i + 1 ) & ( ( i + 1 - localNumPoint ) >> 31 ); 3110 const int i2 = ( i + 2 ) & ( ( i + 2 - localNumPoint ) >> 31 ); 3111 const int i3 = ( i + 3 ) & ( ( i + 3 - localNumPoint ) >> 31 ); 3112 const int i4 = ( i + 4 ) & ( ( i + 4 - localNumPoint ) >> 31 ); 3113 3114 const __m128 p0 = _mm_load_ps( points[i0].ToFloatPtr() ); 3115 const __m128 p1 = _mm_load_ps( points[i1].ToFloatPtr() ); 3116 const __m128 p2 = _mm_load_ps( points[i2].ToFloatPtr() ); 3117 const __m128 p3 = _mm_load_ps( points[i3].ToFloatPtr() ); 3118 const __m128 p4 = _mm_load_ps( points[i4].ToFloatPtr() ); 3119 3120 const __m128 fraction = _mm_load_ps( &clipFractions[i] ); 3121 3122 const __m128 c0 = _mm_madd_ps( _mm_splat_ps( fraction, 0 ), _mm_sub_ps( p1, p0 ), p0 ); 3123 const __m128 c1 = _mm_madd_ps( _mm_splat_ps( fraction, 1 ), _mm_sub_ps( p2, p1 ), p1 ); 3124 const __m128 c2 = _mm_madd_ps( _mm_splat_ps( fraction, 2 ), _mm_sub_ps( p3, p2 ), p2 ); 3125 const __m128 c3 = _mm_madd_ps( _mm_splat_ps( fraction, 3 ), _mm_sub_ps( p4, p3 ), p3 ); 3126 3127 _mm_store_ps( newPoints[indices[i * 2 + 0]].ToFloatPtr(), p0 ); 3128 _mm_store_ps( newPoints[indices[i * 2 + 1]].ToFloatPtr(), c0 ); 3129 _mm_store_ps( newPoints[indices[i * 2 + 2]].ToFloatPtr(), p1 ); 3130 _mm_store_ps( newPoints[indices[i * 2 + 3]].ToFloatPtr(), c1 ); 3131 _mm_store_ps( 
newPoints[indices[i * 2 + 4]].ToFloatPtr(), p2 ); 3132 _mm_store_ps( newPoints[indices[i * 2 + 5]].ToFloatPtr(), c2 ); 3133 _mm_store_ps( newPoints[indices[i * 2 + 6]].ToFloatPtr(), p3 ); 3134 _mm_store_ps( newPoints[indices[i * 2 + 7]].ToFloatPtr(), c3 ); 3135 } 3136 } 3137 3138 /* 3139 ======================== 3140 ClipHomogeneousPolygonToUnitCube 3141 3142 Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes. 3143 ======================== 3144 */ 3145 static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4 * points, int numPoints ) { 3146 assert( numPoints < 16 - 6 ); 3147 ALIGNTYPE16 idVec4 newPoints[16 * 2]; 3148 3149 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 3150 ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_zero ); // near 3151 #else 3152 ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 2, vector_float_neg_one, vector_float_one ); // near 3153 #endif 3154 ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 2, vector_float_pos_one, vector_float_one ); // far 3155 ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 1, vector_float_neg_one, vector_float_one ); // bottom 3156 ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 1, vector_float_pos_one, vector_float_one ); // top 3157 ClipHomogeneousPolygonToSide_SSE2( newPoints, points, numPoints, 0, vector_float_neg_one, vector_float_one ); // left 3158 ClipHomogeneousPolygonToSide_SSE2( points, newPoints, numPoints, 0, vector_float_pos_one, vector_float_one ); // right 3159 return numPoints; 3160 } 3161 3162 #else 3163 3164 /* 3165 ======================== 3166 ClipHomogeneousLineToSide 3167 3168 Clips a line with homogeneous coordinates to the axis aligned plane[axis] = side. 
3169 ======================== 3170 */ 3171 static idVec4 ClipHomogeneousLineToSide( const idVec4 & p0, const idVec4 & p1, int axis, float side ) { 3172 const float d0 = p0.w * side - p0[axis]; 3173 const float d1 = p1.w * side - p1[axis]; 3174 const float delta = d0 - d1; 3175 const float f = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f; 3176 const float c = idMath::ClampFloat( 0.0f, 1.0f, f ); 3177 return p0 + c * ( p1 - p0 ); 3178 } 3179 3180 /* 3181 ======================== 3182 ClipHomogeneousPolygonToSide 3183 3184 Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset. 3185 ======================== 3186 */ 3187 static int ClipHomogeneousPolygonToSide_Generic( idVec4 * __restrict newPoints, idVec4 * __restrict points, int numPoints, int axis, float sign, float offset ) { 3188 assert( newPoints != points ); 3189 3190 assert( numPoints < 16 ); 3191 int sides[16]; 3192 3193 const float side = sign * offset; 3194 3195 // calculate the plane side for each original point and calculate all potential new points 3196 for ( int i = 0; i < numPoints; i++ ) { 3197 int j = ( i + 1 ) & ( ( i + 1 - numPoints ) >> 31 ); 3198 sides[i] = sign * points[i][axis] < offset * points[i].w; 3199 newPoints[i * 2 + 0] = points[i]; 3200 newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side ); 3201 }; 3202 3203 // repeat the first side at the end to avoid having to wrap around 3204 sides[numPoints] = sides[0]; 3205 3206 // compact the array of points 3207 int numNewPoints = 0; 3208 for ( int i = 0; i < numPoints; i++ ) { 3209 if ( sides[i + 0] != 0 ) { 3210 newPoints[numNewPoints++] = newPoints[i * 2 + 0]; 3211 } 3212 if ( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) { 3213 newPoints[numNewPoints++] = newPoints[i * 2 + 1]; 3214 } 3215 } 3216 3217 assert( numNewPoints <= 16 ); 3218 return numNewPoints; 3219 } 3220 3221 /* 3222 ======================== 3223 ClipHomogeneousPolygonToUnitCube 3224 
3225 Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes. 3226 ======================== 3227 */ 3228 static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4 * points, int numPoints ) { 3229 assert( numPoints < 16 - 6 ); 3230 ALIGNTYPE16 idVec4 newPoints[2 * 16]; // the C clip code temporarily doubles the points 3231 3232 #if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 3233 numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 0.0f ); // near 3234 #else 3235 numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 1.0f ); // near 3236 #endif 3237 numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 2, +1.0f, 1.0f ); // far 3238 numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 1, -1.0f, 1.0f ); // bottom 3239 numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 1, +1.0f, 1.0f ); // top 3240 numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 0, -1.0f, 1.0f ); // left 3241 numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 0, +1.0f, 1.0f ); // right 3242 return numPoints; 3243 } 3244 3245 #endif 3246 3247 /* 3248 ======================== 3249 idRenderMatrix::ProjectedFullyClippedBounds 3250 3251 Calculates the bounds of the given bounding box projected with the given Model View Projection (MVP) matrix. 3252 If 'windowSpace' is true then the calculated bounds along each axis are moved and clamped to the [0, 1] range. 3253 3254 The given bounding box is first fully clipped to the MVP to get the smallest projected bounds. 3255 3256 Note that this code assumes the MVP matrix has an infinite far clipping plane. 
When the far plane is at 3257 infinity the bounds are never far clipped and it is sufficient to test whether or not the center of the 3258 near clip plane is inside the bounds to calculate the correct minimum Z. If the far plane is not at 3259 infinity then this code would also have to test for the view frustum being completely contained inside 3260 the given bounds in which case the projected bounds should be set to fully cover the view frustum. 3261 ======================== 3262 */ 3263 void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { 3264 #ifdef ID_WIN_X86_SSE2_INTRIN 3265 3266 const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 3267 const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 3268 const __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 3269 const __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 3270 3271 const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 ); // mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1] 3272 const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 ); // mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3] 3273 const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 ); // mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1] 3274 const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 ); // mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3] 3275 3276 const __m128 mvpX = _mm_unpacklo_ps( t0, t2 ); // mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0] 3277 const __m128 mvpY = _mm_unpackhi_ps( t0, t2 ); // mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1] 3278 const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 ); // mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2] 3279 const __m128 mvpW = _mm_unpackhi_ps( t1, t3 ); // mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3] 3280 3281 const __m128 b0 = _mm_loadu_bounds_0( bounds ); 3282 const __m128 b1 = _mm_loadu_bounds_1( bounds ); 3283 3284 const __m128 b0X = _mm_splat_ps( b0, 0 ); 3285 const __m128 b0Y = _mm_splat_ps( b0, 1 ); 3286 const __m128 b0Z = _mm_splat_ps( b0, 2 ); 3287 3288 const __m128 b1X = _mm_splat_ps( b1, 0 ); 3289 const 
__m128 b1Y = _mm_splat_ps( b1, 1 ); 3290 const __m128 b1Z = _mm_splat_ps( b1, 2 ); 3291 3292 const __m128 p0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3293 const __m128 p1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3294 const __m128 p2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3295 const __m128 p3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3296 const __m128 p4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3297 const __m128 p5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3298 const __m128 p6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3299 const __m128 p7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3300 3301 ALIGNTYPE16 idVec4 projectedPoints[8]; 3302 _mm_store_ps( projectedPoints[0].ToFloatPtr(), p0 ); 3303 _mm_store_ps( projectedPoints[1].ToFloatPtr(), p1 ); 3304 _mm_store_ps( projectedPoints[2].ToFloatPtr(), p2 ); 3305 _mm_store_ps( projectedPoints[3].ToFloatPtr(), p3 ); 3306 _mm_store_ps( projectedPoints[4].ToFloatPtr(), p4 ); 3307 _mm_store_ps( projectedPoints[5].ToFloatPtr(), p5 ); 3308 _mm_store_ps( projectedPoints[6].ToFloatPtr(), p6 ); 3309 _mm_store_ps( projectedPoints[7].ToFloatPtr(), p7 ); 3310 3311 ALIGNTYPE16 idVec4 clippedPoints[6 * 16]; 3312 int numClippedPoints = 0; 3313 for ( int i = 0; i < 6; i++ ) { 3314 _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][0]].ToFloatPtr() ) ); 3315 _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][1]].ToFloatPtr() ) ); 3316 _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][2]].ToFloatPtr() ) 
); 3317 _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedPoints[boxPolygonVertices[i][3]].ToFloatPtr() ) ); 3318 numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 ); 3319 } 3320 3321 // repeat the first clipped point at the end to get a multiple of 4 clipped points 3322 const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() ); 3323 for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) { 3324 _mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 ); 3325 } 3326 3327 // test if the center of the near clip plane is inside the given bounding box 3328 const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); 3329 const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter ); 3330 3331 __m128 minX = vector_float_pos_infinity; 3332 __m128 minY = vector_float_pos_infinity; 3333 __m128 minZ = inside ? vector_float_neg_one : vector_float_pos_infinity; 3334 3335 __m128 maxX = vector_float_neg_infinity; 3336 __m128 maxY = vector_float_neg_infinity; 3337 __m128 maxZ = vector_float_neg_infinity; 3338 3339 for ( int i = 0; i < numClippedPoints; i += 4 ) { 3340 const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() ); 3341 const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() ); 3342 const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() ); 3343 const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() ); 3344 3345 const __m128 s0 = _mm_unpacklo_ps( cp0, cp2 ); // cp0[0], cp2[0], cp0[1], cp2[1] 3346 const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 ); // cp0[2], cp2[2], cp0[3], cp2[3] 3347 const __m128 s2 = _mm_unpacklo_ps( cp1, cp3 ); // cp1[0], cp3[0], cp1[1], cp3[1] 3348 const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 ); // cp1[2], cp3[2], cp1[3], cp3[3] 3349 3350 const __m128 cpX = _mm_unpacklo_ps( s0, s2 ); // cp0[0], cp1[0], cp2[0], cp3[0] 3351 const __m128 cpY = _mm_unpackhi_ps( s0, s2 ); // cp0[1], 
cp1[1], cp2[1], cp3[1] 3352 const __m128 cpZ = _mm_unpacklo_ps( s1, s3 ); // cp0[2], cp1[2], cp2[2], cp3[2] 3353 const __m128 cpW = _mm_unpackhi_ps( s1, s3 ); // cp0[3], cp1[3], cp2[3], cp3[3] 3354 3355 const __m128 rW = _mm_rcp32_ps( cpW ); 3356 const __m128 rX = _mm_mul_ps( cpX, rW ); 3357 const __m128 rY = _mm_mul_ps( cpY, rW ); 3358 const __m128 rZ = _mm_mul_ps( cpZ, rW ); 3359 3360 minX = _mm_min_ps( minX, rX ); 3361 minY = _mm_min_ps( minY, rY ); 3362 minZ = _mm_min_ps( minZ, rZ ); 3363 3364 maxX = _mm_max_ps( maxX, rX ); 3365 maxY = _mm_max_ps( maxY, rY ); 3366 maxZ = _mm_max_ps( maxZ, rZ ); 3367 } 3368 3369 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3370 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3371 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3372 3373 minX = _mm_min_ps( minX, _mm_perm_ps( minX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3374 minY = _mm_min_ps( minY, _mm_perm_ps( minY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3375 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3376 3377 maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3378 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3379 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3380 3381 maxX = _mm_max_ps( maxX, _mm_perm_ps( maxX, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3382 maxY = _mm_max_ps( maxY, _mm_perm_ps( maxY, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3383 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3384 3385 if ( windowSpace ) { 3386 minX = _mm_madd_ps( minX, vector_float_half, vector_float_half ); 3387 maxX = _mm_madd_ps( maxX, vector_float_half, vector_float_half ); 3388 3389 minY = _mm_madd_ps( minY, vector_float_half, vector_float_half ); 3390 maxY = _mm_madd_ps( maxY, vector_float_half, vector_float_half ); 3391 3392 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the 
range [0,1] 3393 minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half ); 3394 maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half ); 3395 #endif 3396 3397 minX = _mm_max_ps( _mm_min_ps( minX, vector_float_one ), vector_float_zero ); 3398 maxX = _mm_max_ps( _mm_min_ps( maxX, vector_float_one ), vector_float_zero ); 3399 3400 minY = _mm_max_ps( _mm_min_ps( minY, vector_float_one ), vector_float_zero ); 3401 maxY = _mm_max_ps( _mm_min_ps( maxY, vector_float_one ), vector_float_zero ); 3402 3403 minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero ); 3404 maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero ); 3405 } 3406 3407 _mm_store_ss( & projected[0].x, minX ); 3408 _mm_store_ss( & projected[0].y, minY ); 3409 _mm_store_ss( & projected[0].z, minZ ); 3410 3411 _mm_store_ss( & projected[1].x, maxX ); 3412 _mm_store_ss( & projected[1].y, maxY ); 3413 _mm_store_ss( & projected[1].z, maxZ ); 3414 3415 #else 3416 3417 const idVec3 points[8] = { 3418 idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), 3419 idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), 3420 idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), 3421 idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), 3422 idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), 3423 idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), 3424 idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), 3425 idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) 3426 }; 3427 3428 idVec4 projectedPoints[8]; 3429 for ( int i = 0; i < 8; i++ ) { 3430 const idVec3 & v = points[i]; 3431 projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; 3432 projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; 3433 projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 3434 projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 3435 } 3436 3437 idVec4 
clippedPoints[6 * 16]; 3438 int numClippedPoints = 0; 3439 for ( int i = 0; i < 6; i++ ) { 3440 clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]]; 3441 clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]]; 3442 clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]]; 3443 clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]]; 3444 numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); 3445 } 3446 3447 // test if the center of the near clip plane is inside the given bounding box 3448 const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); 3449 const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter ); 3450 3451 for ( int i = 0; i < 3; i++ ) { 3452 projected[0][i] = RENDER_MATRIX_INFINITY; 3453 projected[1][i] = - RENDER_MATRIX_INFINITY; 3454 } 3455 if ( inside ) { 3456 projected[0][2] = -1.0f; 3457 } 3458 3459 for ( int i = 0; i < numClippedPoints; i++ ) { 3460 const idVec4 & c = clippedPoints[i]; 3461 3462 assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); 3463 3464 const float rw = 1.0f / c.w; 3465 3466 const float px = c.x * rw; 3467 const float py = c.y * rw; 3468 const float pz = c.z * rw; 3469 3470 projected[0][0] = Min( projected[0][0], px ); 3471 projected[0][1] = Min( projected[0][1], py ); 3472 projected[0][2] = Min( projected[0][2], pz ); 3473 3474 projected[1][0] = Max( projected[1][0], px ); 3475 projected[1][1] = Max( projected[1][1], py ); 3476 projected[1][2] = Max( projected[1][2], pz ); 3477 } 3478 3479 if ( windowSpace ) { 3480 // convert to window coords 3481 projected[0][0] = projected[0][0] * 0.5f + 0.5f; 3482 projected[1][0] = projected[1][0] * 0.5f + 0.5f; 3483 3484 projected[0][1] = projected[0][1] * 0.5f + 0.5f; 3485 projected[1][1] = projected[1][1] * 0.5f + 0.5f; 3486 3487 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z 
is already in the range [0,1] 3488 projected[0][2] = projected[0][2] * 0.5f + 0.5f; 3489 projected[1][2] = projected[1][2] * 0.5f + 0.5f; 3490 #endif 3491 3492 // clamp to [0, 1] range 3493 projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); 3494 projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); 3495 3496 projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); 3497 projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); 3498 3499 projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); 3500 projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); 3501 } 3502 3503 #endif 3504 } 3505 3506 /* 3507 ======================== 3508 idRenderMatrix::DepthBoundsForBounds 3509 3510 Calculates the depth bounds of the given bounding box projected with the given Model View Projection (MVP) matrix. 3511 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range. 3512 3513 The given bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible. 
3514 ======================== 3515 */ 3516 void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { 3517 #ifdef ID_WIN_X86_SSE2_INTRIN 3518 3519 __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 3520 __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 3521 3522 __m128 b0 = _mm_loadu_bounds_0( bounds ); 3523 __m128 b1 = _mm_loadu_bounds_1( bounds ); 3524 3525 // take the four points on the X-Y plane 3526 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 3527 __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 3528 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 3529 3530 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 3531 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 3532 3533 // compute four partial Z,W values 3534 __m128 parz = _mm_splat_ps( mvp2, 3 ); 3535 __m128 parw = _mm_splat_ps( mvp3, 3 ); 3536 3537 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 3538 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 3539 3540 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 3541 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 3542 3543 __m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz ); 3544 __m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw ); 3545 3546 __m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz ); 3547 __m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw ); 3548 3549 __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 ); 3550 w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) ); 3551 3552 __m128 rw0 = _mm_rcp32_ps( w0 ); 3553 z0 = _mm_mul_ps( z0, rw0 ); 3554 z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 ); 3555 3556 __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 ); 3557 w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) ); 3558 3559 
__m128 rw1 = _mm_rcp32_ps( w1 ); 3560 z1 = _mm_mul_ps( z1, rw1 ); 3561 z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 ); 3562 3563 __m128 minv = _mm_min_ps( z0, z1 ); 3564 __m128 maxv = _mm_max_ps( z0, z1 ); 3565 3566 minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3567 minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3568 3569 maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3570 maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3571 3572 if ( windowSpace ) { 3573 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 3574 minv = _mm_madd_ps( minv, vector_float_half, vector_float_half ); 3575 maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half ); 3576 #endif 3577 minv = _mm_max_ps( minv, vector_float_zero ); 3578 maxv = _mm_min_ps( maxv, vector_float_one ); 3579 } 3580 3581 _mm_store_ss( & min, minv ); 3582 _mm_store_ss( & max, maxv ); 3583 3584 #else 3585 3586 float localMin = RENDER_MATRIX_INFINITY; 3587 float localMax = - RENDER_MATRIX_INFINITY; 3588 3589 idVec3 v; 3590 for ( int x = 0; x < 2; x++ ) { 3591 v[0] = bounds[x][0]; 3592 for ( int y = 0; y < 2; y++ ) { 3593 v[1] = bounds[y][1]; 3594 for ( int z = 0; z < 2; z++ ) { 3595 v[2] = bounds[z][2]; 3596 3597 float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 3598 float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 3599 3600 if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) { 3601 tz = tz / tw; 3602 } else { 3603 tz = -RENDER_MATRIX_INFINITY; 3604 } 3605 3606 localMin = Min( localMin, tz ); 3607 localMax = Max( localMax, tz ); 3608 } 3609 } 3610 } 3611 3612 if ( windowSpace ) { 3613 // convert to window coords 3614 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 3615 min = localMin * 0.5f + 0.5f; 3616 max = localMax * 0.5f + 0.5f; 3617 #endif 3618 // clamp to the [0, 1] range 
3619 min = Max( min, 0.0f ); 3620 max = Min( max, 1.0f ); 3621 } 3622 3623 #endif 3624 } 3625 3626 /* 3627 ======================== 3628 idRenderMatrix::DepthBoundsForExtrudedBounds 3629 3630 Calculates the depth bounds of the given extruded bounding box projected with the given Model View Projection (MVP) matrix. 3631 The given bounding box is extruded in the 'extrudeDirection' up to the 'clipPlane'. 3632 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range. 3633 3634 The extruded bounding box is not clipped to the MVP so the depth bounds may not be as tight as possible. 3635 ======================== 3636 */ 3637 void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, bool windowSpace ) { 3638 assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); 3639 3640 #ifdef ID_WIN_X86_SSE2_INTRIN 3641 3642 __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 3643 __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 3644 3645 __m128 b0 = _mm_loadu_bounds_0( bounds ); 3646 __m128 b1 = _mm_loadu_bounds_1( bounds ); 3647 3648 // take the four points on the X-Y plane 3649 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 3650 __m128 vx = _mm_perm_ps( vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 3651 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 3652 3653 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 3654 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 3655 3656 __m128 minv; 3657 __m128 maxv; 3658 3659 // calculate the min/max depth values for the bounding box corners 3660 { 3661 // compute four partial Z,W values 3662 __m128 parz = _mm_splat_ps( mvp2, 3 ); 3663 __m128 parw = _mm_splat_ps( mvp3, 3 ); 3664 3665 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 
3666 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 3667 3668 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 3669 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 3670 3671 __m128 z0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp2, 2 ), parz ); 3672 __m128 w0 = _mm_madd_ps( vz0, _mm_splat_ps( mvp3, 2 ), parw ); 3673 3674 __m128 z1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp2, 2 ), parz ); 3675 __m128 w1 = _mm_madd_ps( vz1, _mm_splat_ps( mvp3, 2 ), parw ); 3676 3677 __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 ); 3678 w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) ); 3679 3680 __m128 rw0 = _mm_rcp32_ps( w0 ); 3681 z0 = _mm_mul_ps( z0, rw0 ); 3682 z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 ); 3683 3684 __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 ); 3685 w1 = _mm_or_ps( w1, _mm_and_ps( vector_float_smallest_non_denorm, s1 ) ); 3686 3687 __m128 rw1 = _mm_rcp32_ps( w1 ); 3688 z1 = _mm_mul_ps( z1, rw1 ); 3689 z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 ); 3690 3691 minv = _mm_min_ps( z0, z1 ); 3692 maxv = _mm_max_ps( z0, z1 ); 3693 } 3694 3695 // calculate and include the min/max depth value for the extruded bounding box corners 3696 { 3697 __m128 clipX = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 0 ), 0 ); 3698 __m128 clipY = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 1 ), 0 ); 3699 __m128 clipZ = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 2 ), 0 ); 3700 __m128 clipW = _mm_splat_ps( _mm_load_ss( clipPlane.ToFloatPtr() + 3 ), 0 ); 3701 3702 __m128 extrudeX = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 0 ), 0 ); 3703 __m128 extrudeY = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 1 ), 0 ); 3704 __m128 extrudeZ = _mm_splat_ps( _mm_load_ss( extrudeDirection.ToFloatPtr() + 2 ), 0 ); 3705 3706 __m128 closing = _mm_madd_ps( clipX, extrudeX, _mm_madd_ps( clipY, extrudeY, _mm_mul_ps( clipZ, extrudeZ ) ) ); 3707 __m128 invClosing = 
_mm_rcp32_ps( closing ); 3708 invClosing = _mm_xor_ps( invClosing, vector_float_sign_bit ); 3709 3710 __m128 dt = _mm_madd_ps( clipX, vx, _mm_madd_ps( clipY, vy, clipW ) ); 3711 __m128 d0 = _mm_madd_ps( clipZ, vz0, dt ); 3712 __m128 d1 = _mm_madd_ps( clipZ, vz1, dt ); 3713 3714 d0 = _mm_mul_ps( d0, invClosing ); 3715 d1 = _mm_mul_ps( d1, invClosing ); 3716 3717 __m128 vx0 = _mm_madd_ps( extrudeX, d0, vx ); 3718 __m128 vx1 = _mm_madd_ps( extrudeX, d1, vx ); 3719 3720 __m128 vy0 = _mm_madd_ps( extrudeY, d0, vy ); 3721 __m128 vy1 = _mm_madd_ps( extrudeY, d1, vy ); 3722 3723 vz0 = _mm_madd_ps( extrudeZ, d0, vz0 ); 3724 vz1 = _mm_madd_ps( extrudeZ, d1, vz1 ); 3725 3726 __m128 mvp2X = _mm_splat_ps( mvp2, 0 ); 3727 __m128 mvp3X = _mm_splat_ps( mvp3, 0 ); 3728 3729 __m128 mvp2W = _mm_splat_ps( mvp2, 3 ); 3730 __m128 mvp3W = _mm_splat_ps( mvp3, 3 ); 3731 3732 __m128 z0 = _mm_madd_ps( vx0, mvp2X, mvp2W ); 3733 __m128 w0 = _mm_madd_ps( vx0, mvp3X, mvp3W ); 3734 3735 __m128 z1 = _mm_madd_ps( vx1, mvp2X, mvp2W ); 3736 __m128 w1 = _mm_madd_ps( vx1, mvp3X, mvp3W ); 3737 3738 __m128 mvp2Y = _mm_splat_ps( mvp2, 1 ); 3739 __m128 mvp3Y = _mm_splat_ps( mvp3, 1 ); 3740 3741 z0 = _mm_madd_ps( vy0, mvp2Y, z0 ); 3742 w0 = _mm_madd_ps( vy0, mvp3Y, w0 ); 3743 3744 z1 = _mm_madd_ps( vy1, mvp2Y, z1 ); 3745 w1 = _mm_madd_ps( vy1, mvp3Y, w1 ); 3746 3747 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 3748 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 3749 3750 z0 = _mm_madd_ps( vz0, mvp2Z, z0 ); 3751 w0 = _mm_madd_ps( vz0, mvp3Z, w0 ); 3752 3753 z1 = _mm_madd_ps( vz1, mvp2Z, z1 ); 3754 w1 = _mm_madd_ps( vz1, mvp3Z, w1 ); 3755 3756 __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 ); 3757 w0 = _mm_or_ps( w0, _mm_and_ps( vector_float_smallest_non_denorm, s0 ) ); 3758 3759 __m128 rw0 = _mm_rcp32_ps( w0 ); 3760 z0 = _mm_mul_ps( z0, rw0 ); 3761 z0 = _mm_sel_ps( z0, vector_float_neg_infinity, s0 ); 3762 3763 __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 ); 3764 w1 = _mm_or_ps( w1, 
_mm_and_ps( vector_float_smallest_non_denorm, s1 ) ); 3765 3766 __m128 rw1 = _mm_rcp32_ps( w1 ); 3767 z1 = _mm_mul_ps( z1, rw1 ); 3768 z1 = _mm_sel_ps( z1, vector_float_neg_infinity, s1 ); 3769 3770 minv = _mm_min_ps( minv, z0 ); 3771 maxv = _mm_max_ps( maxv, z0 ); 3772 3773 minv = _mm_min_ps( minv, z1 ); 3774 maxv = _mm_max_ps( maxv, z1 ); 3775 } 3776 3777 minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3778 minv = _mm_min_ps( minv, _mm_perm_ps( minv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3779 3780 maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 3781 maxv = _mm_max_ps( maxv, _mm_perm_ps( maxv, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 3782 3783 if ( windowSpace ) { 3784 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 3785 minv = _mm_madd_ps( minv, vector_float_half, vector_float_half ); 3786 maxv = _mm_madd_ps( maxv, vector_float_half, vector_float_half ); 3787 #endif 3788 minv = _mm_max_ps( minv, vector_float_zero ); 3789 maxv = _mm_min_ps( maxv, vector_float_one ); 3790 } 3791 3792 _mm_store_ss( & min, minv ); 3793 _mm_store_ss( & max, maxv ); 3794 3795 #else 3796 3797 const float closing = extrudeDirection * clipPlane.Normal(); 3798 const float invClosing = -1.0f / closing; 3799 3800 float localMin = RENDER_MATRIX_INFINITY; 3801 float localMax = - RENDER_MATRIX_INFINITY; 3802 3803 idVec3 v; 3804 for ( int x = 0; x < 2; x++ ) { 3805 v[0] = bounds[x][0]; 3806 for ( int y = 0; y < 2; y++ ) { 3807 v[1] = bounds[y][1]; 3808 for ( int z = 0; z < 2; z++ ) { 3809 v[2] = bounds[z][2]; 3810 3811 for ( int extrude = 0; extrude <= 1; extrude++ ) { 3812 3813 idVec3 test; 3814 if ( extrude ) { 3815 float extrudeDist = clipPlane.Distance( v ) * invClosing; 3816 test = v + extrudeDirection * extrudeDist; 3817 } else { 3818 test = v; 3819 } 3820 3821 float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3]; 3822 float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * 
mvp[3][2] + mvp[3][3]; 3823 3824 if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) { 3825 tz = tz / tw; 3826 } else { 3827 tz = -RENDER_MATRIX_INFINITY; 3828 } 3829 3830 localMin = Min( localMin, tz ); 3831 localMax = Max( localMax, tz ); 3832 } 3833 } 3834 } 3835 } 3836 3837 if ( windowSpace ) { 3838 // convert to window coords 3839 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 3840 min = localMin * 0.5f + 0.5f; 3841 max = localMax * 0.5f + 0.5f; 3842 #endif 3843 // clamp to the [0, 1] range 3844 min = Max( min, 0.0f ); 3845 max = Min( max, 1.0f ); 3846 } 3847 3848 #endif 3849 } 3850 3851 /* 3852 ======================== 3853 PointInsideInfiniteShadow 3854 3855 Returns true if the 'localPoint' is inside the infinite shadow volume cast 3856 from the given occluder bounding box and the given light position. 3857 ======================== 3858 */ 3859 static bool PointInsideInfiniteShadow( const idBounds & occluderBounds, const idVec3 & localLightOrigin, const idVec3 & localPoint, const float epsilon ) { 3860 // Expand the bounds with an epsilon. 3861 const idBounds expandedBounds = occluderBounds.Expand( epsilon ); 3862 3863 // If the point is inside the bounding box then the point 3864 // is also inside the shadow projection. 3865 if ( expandedBounds.ContainsPoint( localPoint ) ) { 3866 return true; 3867 } 3868 3869 // If the light is inside the bounding box then the shadow is projected 3870 // in all directions and any point is inside the infinte shadow projection. 3871 if ( expandedBounds.ContainsPoint( localPoint ) ) { 3872 return true; 3873 } 3874 3875 // If the line from localLightOrigin to localPoint intersects the 3876 // bounding box then the point is inside the infinite shadow projection. 3877 if ( expandedBounds.LineIntersection( localLightOrigin, localPoint ) ) { 3878 return true; 3879 } 3880 3881 // The point is definitely not inside the projected shadow. 
3882 return false; 3883 } 3884 3885 /* 3886 ======================== 3887 idRenderMatrix::DepthBoundsForShadowBounds 3888 3889 Calculates the depth bounds of the infinite shadow volume projected with the given Model View Projection (MVP) matrix. 3890 The infinite shadow volume is cast from the given occluder bounding box and the given light position. 3891 If 'windowSpace' is true then the calculated depth bounds are moved and clamped to the [0, 1] range. 3892 3893 The infinite shadow volume is fully clipped to the MVP to get the tightest possible bounds. 3894 3895 Note that this code assumes the MVP matrix has an infinite far clipping plane. When the far plane is at 3896 infinity the shadow volume is never far clipped and it is sufficient to test whether or not the center 3897 of the near clip plane is inside the shadow volume to calculate the correct minimum Z. If the far plane 3898 is not at infinity then this code would also have to test for the view frustum being completely contained 3899 inside the shadow volume to also calculate the correct maximum Z. This could be done, for instance, by 3900 testing if the center of the far clipping plane is contained inside the shadow volume. 
3901 ======================== 3902 */ 3903 void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & localLightOrigin, bool windowSpace ) { 3904 #ifdef ID_WIN_X86_SSE2_INTRIN 3905 3906 const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); 3907 const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); 3908 const __m128 mvp2 = _mm_loadu_ps( mvp[2] ); 3909 const __m128 mvp3 = _mm_loadu_ps( mvp[3] ); 3910 3911 const __m128 t0 = _mm_unpacklo_ps( mvp0, mvp2 ); // mvp[0][0], mvp[2][0], mvp[0][1], mvp[2][1] 3912 const __m128 t1 = _mm_unpackhi_ps( mvp0, mvp2 ); // mvp[0][2], mvp[2][2], mvp[0][3], mvp[2][3] 3913 const __m128 t2 = _mm_unpacklo_ps( mvp1, mvp3 ); // mvp[1][0], mvp[3][0], mvp[1][1], mvp[3][1] 3914 const __m128 t3 = _mm_unpackhi_ps( mvp1, mvp3 ); // mvp[1][2], mvp[3][2], mvp[1][3], mvp[3][3] 3915 3916 const __m128 mvpX = _mm_unpacklo_ps( t0, t2 ); // mvp[0][0], mvp[1][0], mvp[2][0], mvp[3][0] 3917 const __m128 mvpY = _mm_unpackhi_ps( t0, t2 ); // mvp[0][1], mvp[1][1], mvp[2][1], mvp[3][1] 3918 const __m128 mvpZ = _mm_unpacklo_ps( t1, t3 ); // mvp[0][2], mvp[1][2], mvp[2][2], mvp[3][2] 3919 const __m128 mvpW = _mm_unpackhi_ps( t1, t3 ); // mvp[0][3], mvp[1][3], mvp[2][3], mvp[3][3] 3920 3921 const __m128 b0 = _mm_loadu_bounds_0( bounds ); 3922 const __m128 b1 = _mm_loadu_bounds_1( bounds ); 3923 3924 const __m128 lightOriginX = _mm_load_ss( localLightOrigin.ToFloatPtr() + 0 ); 3925 const __m128 lightOriginY = _mm_load_ss( localLightOrigin.ToFloatPtr() + 1 ); 3926 const __m128 lightOriginZ = _mm_load_ss( localLightOrigin.ToFloatPtr() + 2 ); 3927 const __m128 lightOrigin = _mm_unpacklo_ps( _mm_unpacklo_ps( lightOriginX, lightOriginZ ), lightOriginY ); 3928 3929 // calculate the front facing polygon bits 3930 int frontBits = GetBoxFrontBits_SSE2( b0, b1, lightOrigin ); 3931 3932 const __m128 b0X = _mm_splat_ps( b0, 0 ); 3933 const __m128 b0Y = _mm_splat_ps( b0, 1 ); 3934 const __m128 b0Z = _mm_splat_ps( b0, 
2 ); 3935 3936 const __m128 b1X = _mm_splat_ps( b1, 0 ); 3937 const __m128 b1Y = _mm_splat_ps( b1, 1 ); 3938 const __m128 b1Z = _mm_splat_ps( b1, 2 ); 3939 3940 // bounding box corners 3941 const __m128 np0 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3942 const __m128 np1 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3943 const __m128 np2 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3944 const __m128 np3 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b0Z, mvpZ, mvpW ) ) ); 3945 const __m128 np4 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3946 const __m128 np5 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b0Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3947 const __m128 np6 = _mm_madd_ps( b1X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3948 const __m128 np7 = _mm_madd_ps( b0X, mvpX, _mm_madd_ps( b1Y, mvpY, _mm_madd_ps( b1Z, mvpZ, mvpW ) ) ); 3949 3950 ALIGNTYPE16 idVec4 projectedNearPoints[8]; 3951 _mm_store_ps( projectedNearPoints[0].ToFloatPtr(), np0 ); 3952 _mm_store_ps( projectedNearPoints[1].ToFloatPtr(), np1 ); 3953 _mm_store_ps( projectedNearPoints[2].ToFloatPtr(), np2 ); 3954 _mm_store_ps( projectedNearPoints[3].ToFloatPtr(), np3 ); 3955 _mm_store_ps( projectedNearPoints[4].ToFloatPtr(), np4 ); 3956 _mm_store_ps( projectedNearPoints[5].ToFloatPtr(), np5 ); 3957 _mm_store_ps( projectedNearPoints[6].ToFloatPtr(), np6 ); 3958 _mm_store_ps( projectedNearPoints[7].ToFloatPtr(), np7 ); 3959 3960 // subtract the light position from the bounding box 3961 const __m128 lightX = _mm_splat_ps( lightOriginX, 0 ); 3962 const __m128 lightY = _mm_splat_ps( lightOriginY, 0 ); 3963 const __m128 lightZ = _mm_splat_ps( lightOriginZ, 0 ); 3964 3965 const __m128 d0X = _mm_sub_ps( b0X, lightX ); 3966 const __m128 d0Y = _mm_sub_ps( b0Y, lightY ); 3967 const __m128 d0Z = _mm_sub_ps( b0Z, 
lightZ ); 3968 3969 const __m128 d1X = _mm_sub_ps( b1X, lightX ); 3970 const __m128 d1Y = _mm_sub_ps( b1Y, lightY ); 3971 const __m128 d1Z = _mm_sub_ps( b1Z, lightZ ); 3972 3973 // bounding box corners projected to infinity from the light position 3974 const __m128 fp0 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) ); 3975 const __m128 fp1 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) ); 3976 const __m128 fp2 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) ); 3977 const __m128 fp3 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d0Z, mvpZ ) ) ); 3978 const __m128 fp4 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) ); 3979 const __m128 fp5 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d0Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) ); 3980 const __m128 fp6 = _mm_madd_ps( d1X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) ); 3981 const __m128 fp7 = _mm_madd_ps( d0X, mvpX, _mm_madd_ps( d1Y, mvpY, _mm_mul_ps( d1Z, mvpZ ) ) ); 3982 3983 ALIGNTYPE16 idVec4 projectedFarPoints[8]; 3984 _mm_store_ps( projectedFarPoints[0].ToFloatPtr(), fp0 ); 3985 _mm_store_ps( projectedFarPoints[1].ToFloatPtr(), fp1 ); 3986 _mm_store_ps( projectedFarPoints[2].ToFloatPtr(), fp2 ); 3987 _mm_store_ps( projectedFarPoints[3].ToFloatPtr(), fp3 ); 3988 _mm_store_ps( projectedFarPoints[4].ToFloatPtr(), fp4 ); 3989 _mm_store_ps( projectedFarPoints[5].ToFloatPtr(), fp5 ); 3990 _mm_store_ps( projectedFarPoints[6].ToFloatPtr(), fp6 ); 3991 _mm_store_ps( projectedFarPoints[7].ToFloatPtr(), fp7 ); 3992 3993 ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16]; 3994 int numClippedPoints = 0; 3995 3996 // clip the front facing bounding box polygons at the near cap 3997 const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits]; 3998 for ( int i = 0; i < frontPolygons.count; i++ ) { 3999 const int polygon = frontPolygons.indices[i]; 4000 _mm_store_ps( 
clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) ); 4001 _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) ); 4002 _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) ); 4003 _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) ); 4004 numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 ); 4005 } 4006 4007 // clip the front facing bounding box polygons projected to the far cap 4008 for ( int i = 0; i < frontPolygons.count; i++ ) { 4009 const int polygon = frontPolygons.indices[i]; 4010 _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][0]].ToFloatPtr() ) ); 4011 _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][1]].ToFloatPtr() ) ); 4012 _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][2]].ToFloatPtr() ) ); 4013 _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxPolygonVertices[polygon][3]].ToFloatPtr() ) ); 4014 numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 ); 4015 } 4016 4017 // clip the silhouette edge polygons that stretch to infinity 4018 const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits]; 4019 for ( int i = 0; i < silhouetteEdges.count; i++ ) { 4020 const int edge = silhouetteEdges.indices[i]; 4021 _mm_store_ps( clippedPoints[numClippedPoints + 0].ToFloatPtr(), _mm_load_ps( 
projectedNearPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) ); 4022 _mm_store_ps( clippedPoints[numClippedPoints + 1].ToFloatPtr(), _mm_load_ps( projectedNearPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) ); 4023 _mm_store_ps( clippedPoints[numClippedPoints + 2].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][1]].ToFloatPtr() ) ); 4024 _mm_store_ps( clippedPoints[numClippedPoints + 3].ToFloatPtr(), _mm_load_ps( projectedFarPoints[boxEdgeVertices[edge][0]].ToFloatPtr() ) ); 4025 numClippedPoints += ClipHomogeneousPolygonToUnitCube_SSE2( &clippedPoints[numClippedPoints], 4 ); 4026 } 4027 4028 // repeat the first clipped point at the end to get a multiple of 4 clipped points 4029 const __m128 point0 = _mm_load_ps( clippedPoints[0].ToFloatPtr() ); 4030 for ( int i = numClippedPoints; ( i & 3 ) != 0; i++ ) { 4031 _mm_store_ps( clippedPoints[i].ToFloatPtr(), point0 ); 4032 } 4033 4034 // test if the center of the near clip plane is inside the infinite shadow volume 4035 const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); 4036 const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON ); 4037 4038 __m128 minZ = inside ? 
vector_float_neg_one : vector_float_pos_infinity; 4039 __m128 maxZ = vector_float_neg_infinity; 4040 4041 for ( int i = 0; i < numClippedPoints; i += 4 ) { 4042 const __m128 cp0 = _mm_load_ps( clippedPoints[i + 0].ToFloatPtr() ); 4043 const __m128 cp1 = _mm_load_ps( clippedPoints[i + 1].ToFloatPtr() ); 4044 const __m128 cp2 = _mm_load_ps( clippedPoints[i + 2].ToFloatPtr() ); 4045 const __m128 cp3 = _mm_load_ps( clippedPoints[i + 3].ToFloatPtr() ); 4046 4047 const __m128 s1 = _mm_unpackhi_ps( cp0, cp2 ); // cp0[2], cp2[2], cp0[3], cp2[3] 4048 const __m128 s3 = _mm_unpackhi_ps( cp1, cp3 ); // cp1[2], cp3[2], cp1[3], cp3[3] 4049 4050 const __m128 cpZ = _mm_unpacklo_ps( s1, s3 ); // cp0[2], cp1[2], cp2[2], cp3[2] 4051 const __m128 cpW = _mm_unpackhi_ps( s1, s3 ); // cp0[3], cp1[3], cp2[3], cp3[3] 4052 4053 const __m128 rW = _mm_rcp32_ps( cpW ); 4054 const __m128 rZ = _mm_mul_ps( cpZ, rW ); 4055 4056 minZ = _mm_min_ps( minZ, rZ ); 4057 maxZ = _mm_max_ps( maxZ, rZ ); 4058 } 4059 4060 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 4061 minZ = _mm_min_ps( minZ, _mm_perm_ps( minZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 4062 4063 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); 4064 maxZ = _mm_max_ps( maxZ, _mm_perm_ps( maxZ, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); 4065 4066 if ( windowSpace ) { 4067 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 4068 minZ = _mm_madd_ps( minZ, vector_float_half, vector_float_half ); 4069 maxZ = _mm_madd_ps( maxZ, vector_float_half, vector_float_half ); 4070 #endif 4071 4072 minZ = _mm_max_ps( _mm_min_ps( minZ, vector_float_one ), vector_float_zero ); 4073 maxZ = _mm_max_ps( _mm_min_ps( maxZ, vector_float_one ), vector_float_zero ); 4074 } 4075 4076 _mm_store_ss( & min, minZ ); 4077 _mm_store_ss( & max, maxZ ); 4078 4079 #else 4080 4081 const idVec3 points[8] = { 4082 idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), 4083 idVec3( bounds[1][0], bounds[0][1], 
bounds[0][2] ), 4084 idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), 4085 idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), 4086 idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), 4087 idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), 4088 idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), 4089 idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) 4090 }; 4091 4092 // calculate the front facing polygon bits 4093 int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin ); 4094 4095 // bounding box corners 4096 ALIGNTYPE16 idVec4 projectedNearPoints[8]; 4097 for ( int i = 0; i < 8; i++ ) { 4098 const idVec3 & v = points[i]; 4099 projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; 4100 projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; 4101 projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; 4102 projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; 4103 } 4104 4105 // bounding box corners projected to infinity from the light position 4106 ALIGNTYPE16 idVec4 projectedFarPoints[8]; 4107 for ( int i = 0; i < 8; i++ ) { 4108 const idVec3 v = points[i] - localLightOrigin; 4109 projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2]; 4110 projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2]; 4111 projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2]; 4112 projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2]; 4113 } 4114 4115 ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16]; 4116 int numClippedPoints = 0; 4117 4118 // clip the front facing bounding box polygons at the near cap 4119 const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits]; 4120 for ( int i = 0; i < frontPolygons.count; i++ ) { 4121 const int polygon = frontPolygons.indices[i]; 4122 
clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]]; 4123 clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]]; 4124 clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]]; 4125 clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]]; 4126 numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); 4127 } 4128 4129 // clip the front facing bounding box polygons projected to the far cap 4130 for ( int i = 0; i < frontPolygons.count; i++ ) { 4131 const int polygon = frontPolygons.indices[i]; 4132 clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]]; 4133 clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]]; 4134 clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]]; 4135 clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxPolygonVertices[polygon][3]]; 4136 numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); 4137 } 4138 4139 // clip the silhouette edge polygons that stretch to infinity 4140 const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits]; 4141 for ( int i = 0; i < silhouetteEdges.count; i++ ) { 4142 const int edge = silhouetteEdges.indices[i]; 4143 clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]]; 4144 clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]]; 4145 clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]]; 4146 clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]]; 4147 numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); 4148 } 4149 4150 // test if the center of the near clip plane is inside the infinite shadow 
volume 4151 const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); 4152 const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON ); 4153 4154 min = inside ? -1.0f : RENDER_MATRIX_INFINITY; 4155 max = - RENDER_MATRIX_INFINITY; 4156 4157 for ( int i = 0; i < numClippedPoints; i++ ) { 4158 const idVec4 & c = clippedPoints[i]; 4159 4160 assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); 4161 4162 const float rw = 1.0f / c.w; 4163 const float pz = c.z * rw; 4164 4165 min = Min( min, pz ); 4166 max = Max( max, pz ); 4167 } 4168 4169 if ( windowSpace ) { 4170 // convert to window coords 4171 #if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] 4172 min = min * 0.5f + 0.5f; 4173 max = max * 0.5f + 0.5f; 4174 #endif 4175 // clamp to [0, 1] range 4176 min = idMath::ClampFloat( 0.0f, 1.0f, min ); 4177 max = idMath::ClampFloat( 0.0f, 1.0f, max ); 4178 } 4179 4180 #endif 4181 } 4182 4183 /* 4184 ======================== 4185 idRenderMatrix::GetFrustumPlanes 4186 4187 Normally the clip space extends from -1.0 to 1.0 on each axis, but by setting 'zeroToOne' 4188 to true, the clip space will extend from 0.0 to 1.0 on each axis for a light projection matrix. 4189 ======================== 4190 */ 4191 void idRenderMatrix::GetFrustumPlanes( idPlane planes[6], const idRenderMatrix & frustum, bool zeroToOne, bool normalize ) { 4192 // FIXME: need to know whether or not this is a D3D MVP. 4193 // We cannot just assume that it's an D3D MVP matrix when 4194 // zeroToOne = false and CLIP_SPACE_D3D is defined because 4195 // this code may be called for non-MVP matrices. 
4196 const bool isZeroOneZ = false; 4197 4198 if ( zeroToOne ) { 4199 // left: inside(p) = p * frustum[0] > 0 4200 planes[0][0] = frustum[0][0]; 4201 planes[0][1] = frustum[0][1]; 4202 planes[0][2] = frustum[0][2]; 4203 planes[0][3] = frustum[0][3]; 4204 4205 // bottom: inside(p) = p * frustum[1] > 0 4206 planes[2][0] = frustum[1][0]; 4207 planes[2][1] = frustum[1][1]; 4208 planes[2][2] = frustum[1][2]; 4209 planes[2][3] = frustum[1][3]; 4210 4211 // near: inside(p) = p * frustum[2] > 0 4212 planes[4][0] = frustum[2][0]; 4213 planes[4][1] = frustum[2][1]; 4214 planes[4][2] = frustum[2][2]; 4215 planes[4][3] = frustum[2][3]; 4216 } else { 4217 // left: inside(p) = p * frustum[0] > - ( p * frustum[3] ) 4218 planes[0][0] = frustum[3][0] + frustum[0][0]; 4219 planes[0][1] = frustum[3][1] + frustum[0][1]; 4220 planes[0][2] = frustum[3][2] + frustum[0][2]; 4221 planes[0][3] = frustum[3][3] + frustum[0][3]; 4222 4223 // bottom: inside(p) = p * frustum[1] > -( p * frustum[3] ) 4224 planes[2][0] = frustum[3][0] + frustum[1][0]; 4225 planes[2][1] = frustum[3][1] + frustum[1][1]; 4226 planes[2][2] = frustum[3][2] + frustum[1][2]; 4227 planes[2][3] = frustum[3][3] + frustum[1][3]; 4228 4229 // near: inside(p) = p * frustum[2] > -( p * frustum[3] ) 4230 planes[4][0] = isZeroOneZ ? ( frustum[2][0] ) : ( frustum[3][0] + frustum[2][0] ); 4231 planes[4][1] = isZeroOneZ ? ( frustum[2][1] ) : ( frustum[3][1] + frustum[2][1] ); 4232 planes[4][2] = isZeroOneZ ? ( frustum[2][2] ) : ( frustum[3][2] + frustum[2][2] ); 4233 planes[4][3] = isZeroOneZ ? 
( frustum[2][3] ) : ( frustum[3][3] + frustum[2][3] ); 4234 } 4235 4236 // right: inside(p) = p * frustum[0] < p * frustum[3] 4237 planes[1][0] = frustum[3][0] - frustum[0][0]; 4238 planes[1][1] = frustum[3][1] - frustum[0][1]; 4239 planes[1][2] = frustum[3][2] - frustum[0][2]; 4240 planes[1][3] = frustum[3][3] - frustum[0][3]; 4241 4242 // top: inside(p) = p * frustum[1] < p * frustum[3] 4243 planes[3][0] = frustum[3][0] - frustum[1][0]; 4244 planes[3][1] = frustum[3][1] - frustum[1][1]; 4245 planes[3][2] = frustum[3][2] - frustum[1][2]; 4246 planes[3][3] = frustum[3][3] - frustum[1][3]; 4247 4248 // far: inside(p) = p * frustum[2] < p * frustum[3] 4249 planes[5][0] = frustum[3][0] - frustum[2][0]; 4250 planes[5][1] = frustum[3][1] - frustum[2][1]; 4251 planes[5][2] = frustum[3][2] - frustum[2][2]; 4252 planes[5][3] = frustum[3][3] - frustum[2][3]; 4253 4254 // optionally normalize the planes 4255 if ( normalize ) { 4256 for ( int i = 0; i < 6; i++ ) { 4257 float s = idMath::InvSqrt( planes[i].Normal().LengthSqr() ); 4258 planes[i][0] *= s; 4259 planes[i][1] *= s; 4260 planes[i][2] *= s; 4261 planes[i][3] *= s; 4262 } 4263 } 4264 } 4265 4266 /* 4267 ======================== 4268 idRenderMatrix::GetFrustumCorners 4269 ======================== 4270 */ 4271 void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRenderMatrix & frustumTransform, const idBounds & frustumBounds ) { 4272 assert_16_byte_aligned( &corners ); 4273 4274 #ifdef ID_WIN_X86_SSE2_INTRIN 4275 4276 __m128 mvp0 = _mm_loadu_ps( frustumTransform[0] ); 4277 __m128 mvp1 = _mm_loadu_ps( frustumTransform[1] ); 4278 __m128 mvp2 = _mm_loadu_ps( frustumTransform[2] ); 4279 __m128 mvp3 = _mm_loadu_ps( frustumTransform[3] ); 4280 4281 __m128 b0 = _mm_loadu_bounds_0( frustumBounds ); 4282 __m128 b1 = _mm_loadu_bounds_1( frustumBounds ); 4283 4284 // take the four points on the X-Y plane 4285 __m128 vxy = _mm_unpacklo_ps( b0, b1 ); // min X, max X, min Y, max Y 4286 __m128 vx = _mm_perm_ps( 
vxy, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // min X, max X, min X, max X 4287 __m128 vy = _mm_perm_ps( vxy, _MM_SHUFFLE( 3, 3, 2, 2 ) ); // min Y, min Y, max Y, max Y 4288 4289 __m128 vz0 = _mm_splat_ps( b0, 2 ); // min Z, min Z, min Z, min Z 4290 __m128 vz1 = _mm_splat_ps( b1, 2 ); // max Z, max Z, max Z, max Z 4291 4292 // compute four partial X,Y,Z,W values 4293 __m128 parx = _mm_splat_ps( mvp0, 3 ); 4294 __m128 pary = _mm_splat_ps( mvp1, 3 ); 4295 __m128 parz = _mm_splat_ps( mvp2, 3 ); 4296 __m128 parw = _mm_splat_ps( mvp3, 3 ); 4297 4298 parx = _mm_madd_ps( vx, _mm_splat_ps( mvp0, 0 ), parx ); 4299 pary = _mm_madd_ps( vx, _mm_splat_ps( mvp1, 0 ), pary ); 4300 parz = _mm_madd_ps( vx, _mm_splat_ps( mvp2, 0 ), parz ); 4301 parw = _mm_madd_ps( vx, _mm_splat_ps( mvp3, 0 ), parw ); 4302 4303 parx = _mm_madd_ps( vy, _mm_splat_ps( mvp0, 1 ), parx ); 4304 pary = _mm_madd_ps( vy, _mm_splat_ps( mvp1, 1 ), pary ); 4305 parz = _mm_madd_ps( vy, _mm_splat_ps( mvp2, 1 ), parz ); 4306 parw = _mm_madd_ps( vy, _mm_splat_ps( mvp3, 1 ), parw ); 4307 4308 __m128 mvp0Z = _mm_splat_ps( mvp0, 2 ); 4309 __m128 mvp1Z = _mm_splat_ps( mvp1, 2 ); 4310 __m128 mvp2Z = _mm_splat_ps( mvp2, 2 ); 4311 __m128 mvp3Z = _mm_splat_ps( mvp3, 2 ); 4312 4313 __m128 x0 = _mm_madd_ps( vz0, mvp0Z, parx ); 4314 __m128 y0 = _mm_madd_ps( vz0, mvp1Z, pary ); 4315 __m128 z0 = _mm_madd_ps( vz0, mvp2Z, parz ); 4316 __m128 w0 = _mm_madd_ps( vz0, mvp3Z, parw ); 4317 4318 __m128 x1 = _mm_madd_ps( vz1, mvp0Z, parx ); 4319 __m128 y1 = _mm_madd_ps( vz1, mvp1Z, pary ); 4320 __m128 z1 = _mm_madd_ps( vz1, mvp2Z, parz ); 4321 __m128 w1 = _mm_madd_ps( vz1, mvp3Z, parw ); 4322 4323 __m128 s0 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w0 ); 4324 __m128 s1 = _mm_cmpgt_ps( vector_float_smallest_non_denorm, w1 ); 4325 4326 w0 = _mm_sel_ps( w0, vector_float_one, s0 ); 4327 w1 = _mm_sel_ps( w1, vector_float_one, s1 ); 4328 4329 __m128 rw0 = _mm_rcp32_ps( w0 ); 4330 __m128 rw1 = _mm_rcp32_ps( w1 ); 4331 4332 x0 = _mm_mul_ps( x0, rw0 
); 4333 y0 = _mm_mul_ps( y0, rw0 ); 4334 z0 = _mm_mul_ps( z0, rw0 ); 4335 4336 x1 = _mm_mul_ps( x1, rw1 ); 4337 y1 = _mm_mul_ps( y1, rw1 ); 4338 z1 = _mm_mul_ps( z1, rw1 ); 4339 4340 _mm_store_ps( corners.x + 0, x0 ); 4341 _mm_store_ps( corners.x + 4, x1 ); 4342 _mm_store_ps( corners.y + 0, y0 ); 4343 _mm_store_ps( corners.y + 4, y1 ); 4344 _mm_store_ps( corners.z + 0, z0 ); 4345 _mm_store_ps( corners.z + 4, z1 ); 4346 4347 #else 4348 4349 idVec3 v; 4350 for ( int x = 0; x < 2; x++ ) { 4351 v[0] = frustumBounds[x][0]; 4352 for ( int y = 0; y < 2; y++ ) { 4353 v[1] = frustumBounds[y][1]; 4354 for ( int z = 0; z < 2; z++ ) { 4355 v[2] = frustumBounds[z][2]; 4356 4357 float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3]; 4358 float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3]; 4359 float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3]; 4360 float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3]; 4361 4362 assert( tw > idMath::FLT_SMALLEST_NON_DENORMAL ); 4363 4364 float rw = 1.0f / tw; 4365 4366 corners.x[(z<<2)|(y<<1)|(x<<0)] = tx * rw; 4367 corners.y[(z<<2)|(y<<1)|(x<<0)] = ty * rw; 4368 corners.z[(z<<2)|(y<<1)|(x<<0)] = tz * rw; 4369 } 4370 } 4371 } 4372 4373 #endif 4374 } 4375 4376 /* 4377 ======================== 4378 idRenderMatrix::CullFrustumCornersToPlane 4379 ======================== 4380 */ 4381 frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t & corners, const idPlane & plane ) { 4382 assert_16_byte_aligned( &corners ); 4383 4384 #ifdef ID_WIN_X86_SSE2_INTRIN 4385 4386 __m128 vp = _mm_loadu_ps( plane.ToFloatPtr() ); 4387 4388 __m128 x0 = _mm_load_ps( corners.x + 0 ); 4389 __m128 y0 = _mm_load_ps( corners.y + 0 ); 4390 __m128 
z0 = _mm_load_ps( corners.z + 0 ); 4391 4392 __m128 x1 = _mm_load_ps( corners.x + 4 ); 4393 __m128 y1 = _mm_load_ps( corners.y + 4 ); 4394 __m128 z1 = _mm_load_ps( corners.z + 4 ); 4395 4396 __m128 p0 = _mm_splat_ps( vp, 0 ); 4397 __m128 p1 = _mm_splat_ps( vp, 1 ); 4398 __m128 p2 = _mm_splat_ps( vp, 2 ); 4399 __m128 p3 = _mm_splat_ps( vp, 3 ); 4400 4401 __m128 d0 = _mm_madd_ps( x0, p0, _mm_madd_ps( y0, p1, _mm_madd_ps( z0, p2, p3 ) ) ); 4402 __m128 d1 = _mm_madd_ps( x1, p0, _mm_madd_ps( y1, p1, _mm_madd_ps( z1, p2, p3 ) ) ); 4403 4404 int b0 = _mm_movemask_ps( d0 ); 4405 int b1 = _mm_movemask_ps( d1 ); 4406 4407 unsigned int front = ( (unsigned int) -( ( b0 & b1 ) ^ 15 ) ) >> 31; 4408 unsigned int back = ( (unsigned int) -( b0 | b1 ) ) >> 31; 4409 4410 compile_time_assert( FRUSTUM_CULL_FRONT == 1 ); 4411 compile_time_assert( FRUSTUM_CULL_BACK == 2 ); 4412 compile_time_assert( FRUSTUM_CULL_CROSS == 3 ); 4413 4414 return (frustumCull_t) ( front | ( back << 1 ) ); 4415 4416 #else 4417 4418 bool front = false; 4419 bool back = false; 4420 for ( int i = 0; i < 8; i++ ) { 4421 const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3]; 4422 if ( d >= 0.0f ) { 4423 front = true; 4424 } else if ( d <= 0.0f ) { 4425 back = true; 4426 } 4427 if ( back && front ) { 4428 return FRUSTUM_CULL_CROSS; 4429 } 4430 } 4431 if ( front ) { 4432 return FRUSTUM_CULL_FRONT; 4433 } else { 4434 return FRUSTUM_CULL_BACK; 4435 } 4436 4437 #endif 4438 }