DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

DynamicShadowVolume.cpp (52080B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 
     29 #include "DynamicShadowVolume_local.h"
     30 
     31 #include "../../../idlib/sys/sys_intrinsics.h"
     32 #include "../../../idlib/geometry/DrawVert_intrinsics.h"
     33 
     34 #ifdef ID_WIN_X86_SSE2_INTRIN
     35 
     36 static const __m128i vector_int_neg_one		= _mm_set_epi32( -1, -1, -1, -1 );	// every bit set in all four 32-bit lanes; selected below as the lane mask when cullShadowTrianglesToLight is true
     37 
     38 /*
     39 =====================
     40 TriangleFacing_SSE2
        
        Four-wide facing test. Every argument carries one coordinate for four
        triangles at once, one triangle per 32-bit lane: vertNX/Y/Z are the
        coordinates of corner N, lightOriginX/Y/Z are the light origin coordinates
        (the callers in this file pass the same value in all four lanes).
        
        Returns a lane mask: all bits set in each lane where 'delta' is below zero,
        zero in the other lanes.
        
        NOTE(review): _mm_madd_ps and _mm_nmsub_ps are not defined in this part of
        the file. Presumably madd( a, b, c ) = a * b + c and nmsub( a, b, c ) = c - a * b,
        in which case 'delta' is the negated 'd' of TriangleFacing_Generic and a set
        lane means "facing the light origin" -- confirm against sys_intrinsics.h.
     41 =====================
     42 */
     43 static __forceinline __m128i TriangleFacing_SSE2(	const __m128 & vert0X, const __m128 & vert0Y, const __m128 & vert0Z,
     44 												const __m128 & vert1X, const __m128 & vert1Y, const __m128 & vert1Z,
     45 												const __m128 & vert2X, const __m128 & vert2Y, const __m128 & vert2Z,
     46 												const __m128 & lightOriginX, const __m128 & lightOriginY, const __m128 & lightOriginZ ) {
     47 	const __m128 sX = _mm_sub_ps( vert1X, vert0X );	// s = corner 1 - corner 0
     48 	const __m128 sY = _mm_sub_ps( vert1Y, vert0Y );
     49 	const __m128 sZ = _mm_sub_ps( vert1Z, vert0Z );
     50 
     51 	const __m128 tX = _mm_sub_ps( vert2X, vert0X );	// t = corner 2 - corner 0
     52 	const __m128 tY = _mm_sub_ps( vert2Y, vert0Y );
     53 	const __m128 tZ = _mm_sub_ps( vert2Z, vert0Z );
     54 
     55 	const __m128 normalX = _mm_nmsub_ps( tZ, sY, _mm_mul_ps( tY, sZ ) );	// presumably tY * sZ - tZ * sY (see NOTE in the header)
     56 	const __m128 normalY = _mm_nmsub_ps( tX, sZ, _mm_mul_ps( tZ, sX ) );
     57 	const __m128 normalZ = _mm_nmsub_ps( tY, sX, _mm_mul_ps( tX, sY ) );
     58 	const __m128 normalW = _mm_madd_ps( normalX, vert0X, _mm_madd_ps( normalY, vert0Y, _mm_mul_ps( normalZ, vert0Z ) ) );	// presumably normal . corner 0
     59 
     60 	const __m128 delta = _mm_nmsub_ps( lightOriginX, normalX, _mm_nmsub_ps( lightOriginY, normalY, _mm_nmsub_ps( lightOriginZ, normalZ, normalW ) ) );	// presumably normalW - normal . lightOrigin
     61 	return _mm_castps_si128( _mm_cmplt_ps( delta, _mm_setzero_ps() ) );	// compare result reinterpreted as an integer lane mask
     62 }
     63 
     64 /*
     65 =====================
     66 TriangleCulled_SSE2
     67 
     68 The clip space of the 'lightProject' is assumed to be in the range [0, 1].
        
        Four-wide cull test. vertNX/Y/Z hold one coordinate of corner N for four
        triangles, one triangle per 32-bit lane. lightProjectX/Y/Z/W each hold four
        matrix elements; the callers in this file load them from lightProject[0..3].
        
        Every corner is transformed to ( cX, cY, cZ, cW ) with an implicit fourth
        input component of 1. Six tests are then made per triangle, each passing when
        at least one corner satisfies it: cX > 0, cY > 0, cZ > 0, cW > cX, cW > cY,
        cW > cZ. The returned lane mask has all bits set where at least one of the
        six tests failed and is zero where all six passed.
        
        NOTE(review): _mm_splat_ps and _mm_madd_ps are not defined in this part of
        the file. Presumably splat( v, i ) copies element i of v into all four lanes
        and madd( a, b, c ) = a * b + c, which makes this the four-wide equivalent of
        TriangleCulled_Generic -- confirm against sys_intrinsics.h.
     69 =====================
     70 */
     71 static __forceinline __m128i TriangleCulled_SSE2(	const __m128 & vert0X, const __m128 & vert0Y, const __m128 & vert0Z,
     72 												const __m128 & vert1X, const __m128 & vert1Y, const __m128 & vert1Z,
     73 												const __m128 & vert2X, const __m128 & vert2Y, const __m128 & vert2Z,
     74 												const __m128 & lightProjectX, const __m128 & lightProjectY, const __m128 & lightProjectZ, const __m128 & lightProjectW ) {
     75 
     76 	const __m128 mvpX0 = _mm_splat_ps( lightProjectX, 0 );	// the four elements of lightProjectX, one per register
     77 	const __m128 mvpX1 = _mm_splat_ps( lightProjectX, 1 );
     78 	const __m128 mvpX2 = _mm_splat_ps( lightProjectX, 2 );
     79 	const __m128 mvpX3 = _mm_splat_ps( lightProjectX, 3 );
     80 
     81 	const __m128 c0X = _mm_madd_ps( vert0X, mvpX0, _mm_madd_ps( vert0Y, mvpX1, _mm_madd_ps( vert0Z, mvpX2, mvpX3 ) ) );	// transformed x of corner 0, 1 and 2
     82 	const __m128 c1X = _mm_madd_ps( vert1X, mvpX0, _mm_madd_ps( vert1Y, mvpX1, _mm_madd_ps( vert1Z, mvpX2, mvpX3 ) ) );
     83 	const __m128 c2X = _mm_madd_ps( vert2X, mvpX0, _mm_madd_ps( vert2Y, mvpX1, _mm_madd_ps( vert2Z, mvpX2, mvpX3 ) ) );
     84 
     85 	const __m128 mvpY0 = _mm_splat_ps( lightProjectY, 0 );
     86 	const __m128 mvpY1 = _mm_splat_ps( lightProjectY, 1 );
     87 	const __m128 mvpY2 = _mm_splat_ps( lightProjectY, 2 );
     88 	const __m128 mvpY3 = _mm_splat_ps( lightProjectY, 3 );
     89 
     90 	const __m128 c0Y = _mm_madd_ps( vert0X, mvpY0, _mm_madd_ps( vert0Y, mvpY1, _mm_madd_ps( vert0Z, mvpY2, mvpY3 ) ) );	// transformed y
     91 	const __m128 c1Y = _mm_madd_ps( vert1X, mvpY0, _mm_madd_ps( vert1Y, mvpY1, _mm_madd_ps( vert1Z, mvpY2, mvpY3 ) ) );
     92 	const __m128 c2Y = _mm_madd_ps( vert2X, mvpY0, _mm_madd_ps( vert2Y, mvpY1, _mm_madd_ps( vert2Z, mvpY2, mvpY3 ) ) );
     93 
     94 	const __m128 mvpZ0 = _mm_splat_ps( lightProjectZ, 0 );
     95 	const __m128 mvpZ1 = _mm_splat_ps( lightProjectZ, 1 );
     96 	const __m128 mvpZ2 = _mm_splat_ps( lightProjectZ, 2 );
     97 	const __m128 mvpZ3 = _mm_splat_ps( lightProjectZ, 3 );
     98 
     99 	const __m128 c0Z = _mm_madd_ps( vert0X, mvpZ0, _mm_madd_ps( vert0Y, mvpZ1, _mm_madd_ps( vert0Z, mvpZ2, mvpZ3 ) ) );	// transformed z
    100 	const __m128 c1Z = _mm_madd_ps( vert1X, mvpZ0, _mm_madd_ps( vert1Y, mvpZ1, _mm_madd_ps( vert1Z, mvpZ2, mvpZ3 ) ) );
    101 	const __m128 c2Z = _mm_madd_ps( vert2X, mvpZ0, _mm_madd_ps( vert2Y, mvpZ1, _mm_madd_ps( vert2Z, mvpZ2, mvpZ3 ) ) );
    102 
    103 	const __m128 mvpW0 = _mm_splat_ps( lightProjectW, 0 );
    104 	const __m128 mvpW1 = _mm_splat_ps( lightProjectW, 1 );
    105 	const __m128 mvpW2 = _mm_splat_ps( lightProjectW, 2 );
    106 	const __m128 mvpW3 = _mm_splat_ps( lightProjectW, 3 );
    107 
    108 	const __m128 c0W = _mm_madd_ps( vert0X, mvpW0, _mm_madd_ps( vert0Y, mvpW1, _mm_madd_ps( vert0Z, mvpW2, mvpW3 ) ) );	// transformed w
    109 	const __m128 c1W = _mm_madd_ps( vert1X, mvpW0, _mm_madd_ps( vert1Y, mvpW1, _mm_madd_ps( vert1Z, mvpW2, mvpW3 ) ) );
    110 	const __m128 c2W = _mm_madd_ps( vert2X, mvpW0, _mm_madd_ps( vert2Y, mvpW1, _mm_madd_ps( vert2Z, mvpW2, mvpW3 ) ) );
    111 
    112 	const __m128 zero = _mm_setzero_ps();
    113 
        	// b0..b5: per lane, all bits set when at least one of the three corners passes the test
    114 	__m128 b0 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0X, zero ), _mm_cmpgt_ps( c1X, zero ) ), _mm_cmpgt_ps( c2X, zero ) );	// x > 0
    115 	__m128 b1 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0Y, zero ), _mm_cmpgt_ps( c1Y, zero ) ), _mm_cmpgt_ps( c2Y, zero ) );	// y > 0
    116 	__m128 b2 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0Z, zero ), _mm_cmpgt_ps( c1Z, zero ) ), _mm_cmpgt_ps( c2Z, zero ) );	// z > 0
    117 	__m128 b3 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0W, c0X ), _mm_cmpgt_ps( c1W, c1X ) ), _mm_cmpgt_ps( c2W, c2X ) );	// w > x
    118 	__m128 b4 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0W, c0Y ), _mm_cmpgt_ps( c1W, c1Y ) ), _mm_cmpgt_ps( c2W, c2Y ) );	// w > y
    119 	__m128 b5 = _mm_or_ps( _mm_or_ps( _mm_cmpgt_ps( c0W, c0Z ), _mm_cmpgt_ps( c1W, c1Z ) ), _mm_cmpgt_ps( c2W, c2Z ) );	// w > z
    120 
        	// combine: b0 ends up set only in lanes where all six tests passed
    121 	b0 = _mm_and_ps( b0, b1 );
    122 	b2 = _mm_and_ps( b2, b3 );
    123 	b4 = _mm_and_ps( b4, b5 );
    124 	b0 = _mm_and_ps( b0, b2 );
    125 	b0 = _mm_and_ps( b0, b4 );
    126 
        	// b0 is a bit mask, not a number: the all-ones pattern is a NaN, so the float compare with zero
        	// is false for it and true for the all-zero pattern, which inverts the mask
    127 	return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
    128 }
    129 
    130 #else
    131 
    132 /*
    133 =====================
    134 TriangleFacing_Generic
    135 
    136 Scalar facing test: 255 when the light origin is on the positive side of the triangle plane, else 0.
    137 =====================
    138 */
    139 static byte TriangleFacing_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idVec3 & lightOrigin ) {
    140 	// the two triangle edges that share corner v1
    141 	const float edgeA[3] = { v2.x - v1.x, v2.y - v1.y, v2.z - v1.z };
    142 	const float edgeB[3] = { v3.x - v1.x, v3.y - v1.y, v3.z - v1.z };
    143 	// plane of the triangle: normal = edgeB x edgeA, plane distance taken at v1
    144 	const float planeX = edgeB[1] * edgeA[2] - edgeB[2] * edgeA[1];
    145 	const float planeY = edgeB[2] * edgeA[0] - edgeB[0] * edgeA[2];
    146 	const float planeZ = edgeB[0] * edgeA[1] - edgeB[1] * edgeA[0];
    147 	const float planeW = planeX * v1.x + planeY * v1.y + planeZ * v1.z;
    148 
    149 	// signed, unnormalized distance of the light origin to that plane
    150 	const float side = lightOrigin.x * planeX + lightOrigin.y * planeY + lightOrigin.z * planeZ - planeW;
    151 	if ( side > 0.0f ) {
    152 		return 255;
    153 	}
    154 	return 0;
    155 }
    156 
    157 /*
    158 =====================
    159 TriangleCulled_Generic
    160 
    161 Scalar cull test: 255 when all three corners are beyond one and the same side of the light projection
    162 volume, otherwise 0. The clip space of 'lightProject' is assumed to be in the range [0, 1].
    163 =====================
    164 */
    165 static byte TriangleCulled_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idRenderMatrix & lightProject ) {
    166 	const idVec3 * corners[3] = { &v1, &v2, &v3 };
    167 
    168 	// transform every corner by the four matrix rows, with an implicit fourth input component of 1
    169 	idVec4 clip[3];
    170 	for ( int row = 0; row < 4; row++ ) {
    171 		for ( int v = 0; v < 3; v++ ) {
    172 			const idVec3 & p = *corners[v];
    173 			clip[v][row] = p[0] * lightProject[row][0] + p[1] * lightProject[row][1] + p[2] * lightProject[row][2] + lightProject[row][3];
    174 		}
    175 	}
    176 
    177 	// one bit per side of the volume, set as soon as any corner is on the inner side of it
    178 	int insideMask = 0;
    179 	for ( int v = 0; v < 3; v++ ) {
    180 		const float w = clip[v][3];
    181 		for ( int axis = 0; axis < 3; axis++ ) {
    182 			const float value = clip[v][axis];
    183 			if ( value > 0.0f ) { insideMask |= ( 1 << ( axis * 2 + 0 ) ); }
    184 			if ( value < w ) { insideMask |= ( 1 << ( axis * 2 + 1 ) ); }
    185 		}
    186 	}
    187 
    188 	// a side without any corner on its inner side has the whole triangle beyond it
    189 	return ( insideMask == 63 ) ? 0 : 255;
    190 }
    191 
    192 #endif
    193 
    194 /*
    195 =====================
    196 CalculateTriangleFacingCulledStatic
        
        For every triangle of an unskinned mesh, writes one byte to 'culled' and one
        byte to 'facing' (255 or 0) and returns a count of the triangles whose facing
        byte is set.
        
        facing, culled       output arrays, indexed by triangle number
        indexes, numIndexes  triangle list, three indexes per triangle
        verts, numVerts      vertices referenced by 'indexes'; only .xyz is read here
        lightOrigin          point used for the facing test and as end of the test line
        viewOrigin           start of the test line
        cullShadowTrianglesToLight
                             when true, a culled triangle is also marked as facing
        lightProject         matrix used for the cull test
        insideShadowVolume   optional, may be NULL. Set to false on entry and to true
                             when R_LineIntersectsTriangleExpandedWithSphere reports a
                             hit between the line viewOrigin -> lightOrigin and any
                             triangle whose facing byte is zero.
        radius               forwarded to R_LineIntersectsTriangleExpandedWithSphere
        
        NOTE(review): the SSE2 path only handles complete groups of four triangles
        per batch and counts with vector_int_one, which is defined elsewhere. This
        presumably relies on FetchNextBatch rounding the batch end up to a multiple
        of 4 * 3 indexes (the last template argument) and on vector_int_one being 1
        in every lane -- confirm.
    197 =====================
    198 */
    199 static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte * __restrict culled, const triIndex_t * __restrict indexes, int numIndexes,
    200 									const idDrawVert * __restrict verts, const int numVerts,
    201 									const idVec3 & lightOrigin, const idVec3 & viewOrigin,
    202 									bool cullShadowTrianglesToLight, const idRenderMatrix & lightProject,
    203 									bool * insideShadowVolume, const float radius ) {
    204 
    205 	assert_spu_local_store( facing );
    206 	assert_not_spu_local_store( indexes );
    207 	assert_not_spu_local_store( verts );
    208 
    209 	if ( insideShadowVolume != NULL ) {
    210 		*insideShadowVolume = false;
    211 	}
    212 
    213 	// calculate the start, end, dir and length of the line from the view origin to the light origin
    214 	const idVec3 lineStart = viewOrigin;
    215 	const idVec3 lineEnd = lightOrigin;
    216 	const idVec3 lineDelta = lineEnd - lineStart;
    217 	const float lineLengthSqr = lineDelta.LengthSqr();
    218 	const float lineLengthRcp = idMath::InvSqrt( lineLengthSqr );
    219 	const idVec3 lineDir = lineDelta * lineLengthRcp;
    220 	const float lineLength = lineLengthSqr * lineLengthRcp;	// length obtained as length^2 * ( 1 / length )
    221 
    222 #ifdef ID_WIN_X86_SSE2_INTRIN
    223 
    224 	idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
    225 
        	// light origin coordinates, each copied to all four lanes
    226 	const __m128 lightOriginX = _mm_splat_ps( _mm_load_ss( &lightOrigin.x ), 0 );
    227 	const __m128 lightOriginY = _mm_splat_ps( _mm_load_ss( &lightOrigin.y ), 0 );
    228 	const __m128 lightOriginZ = _mm_splat_ps( _mm_load_ss( &lightOrigin.z ), 0 );
    229 
        	// four floats from each of lightProject[0..3], loaded without an alignment requirement
    230 	const __m128 lightProjectX = _mm_loadu_ps( lightProject[0] );
    231 	const __m128 lightProjectY = _mm_loadu_ps( lightProject[1] );
    232 	const __m128 lightProjectZ = _mm_loadu_ps( lightProject[2] );
    233 	const __m128 lightProjectW = _mm_loadu_ps( lightProject[3] );
    234 
    235 	const __m128i cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? vector_int_neg_one : vector_int_zero;
    236 
    237 	__m128i numFrontFacing = _mm_setzero_si128();	// four partial counters, summed after the loop
    238 
        	// i walks the indexes, j the triangles
    239 	for ( int i = 0, j = 0; i < numIndexes; ) {
    240 
    241 		const int batchStart = i;
    242 		const int batchEnd = indexedVertsODS.FetchNextBatch();
    243 		const int batchEnd4x = batchEnd - 4 * 3;	// last start index that still leaves four whole triangles
    244 		const int indexStart = j;
    245 
        		// four triangles (A, B, C, D) per iteration
    246 		for ( ; i <= batchEnd4x; i += 4 * 3, j += 4 ) {
        			// aligned 16-byte loads starting at .xyz; the fourth float loaded is never used below
    247 			const __m128 vertA0 = _mm_load_ps( indexedVertsODS[i + 0 * 3 + 0].xyz.ToFloatPtr() );
    248 			const __m128 vertA1 = _mm_load_ps( indexedVertsODS[i + 0 * 3 + 1].xyz.ToFloatPtr() );
    249 			const __m128 vertA2 = _mm_load_ps( indexedVertsODS[i + 0 * 3 + 2].xyz.ToFloatPtr() );
    250 
    251 			const __m128 vertB0 = _mm_load_ps( indexedVertsODS[i + 1 * 3 + 0].xyz.ToFloatPtr() );
    252 			const __m128 vertB1 = _mm_load_ps( indexedVertsODS[i + 1 * 3 + 1].xyz.ToFloatPtr() );
    253 			const __m128 vertB2 = _mm_load_ps( indexedVertsODS[i + 1 * 3 + 2].xyz.ToFloatPtr() );
    254 
    255 			const __m128 vertC0 = _mm_load_ps( indexedVertsODS[i + 2 * 3 + 0].xyz.ToFloatPtr() );
    256 			const __m128 vertC1 = _mm_load_ps( indexedVertsODS[i + 2 * 3 + 1].xyz.ToFloatPtr() );
    257 			const __m128 vertC2 = _mm_load_ps( indexedVertsODS[i + 2 * 3 + 2].xyz.ToFloatPtr() );
    258 
    259 			const __m128 vertD0 = _mm_load_ps( indexedVertsODS[i + 3 * 3 + 0].xyz.ToFloatPtr() );
    260 			const __m128 vertD1 = _mm_load_ps( indexedVertsODS[i + 3 * 3 + 1].xyz.ToFloatPtr() );
    261 			const __m128 vertD2 = _mm_load_ps( indexedVertsODS[i + 3 * 3 + 2].xyz.ToFloatPtr() );
    262 
        			// transpose corner 0 of the four triangles into one register per coordinate
    263 			const __m128 r0X = _mm_unpacklo_ps( vertA0, vertC0 );	// vertA0.x, vertC0.x, vertA0.y, vertC0.y
    264 			const __m128 r0Y = _mm_unpackhi_ps( vertA0, vertC0 );	// vertA0.z, vertC0.z, vertA0.w, vertC0.w
    265 			const __m128 r0Z = _mm_unpacklo_ps( vertB0, vertD0 );	// vertB0.x, vertD0.x, vertB0.y, vertD0.y
    266 			const __m128 r0W = _mm_unpackhi_ps( vertB0, vertD0 );	// vertB0.z, vertD0.z, vertB0.w, vertD0.w
    267 
    268 			const __m128 vert0X = _mm_unpacklo_ps( r0X, r0Z );		// vertA0.x, vertB0.x, vertC0.x, vertD0.x
    269 			const __m128 vert0Y = _mm_unpackhi_ps( r0X, r0Z );		// vertA0.y, vertB0.y, vertC0.y, vertD0.y
    270 			const __m128 vert0Z = _mm_unpacklo_ps( r0Y, r0W );		// vertA0.z, vertB0.z, vertC0.z, vertD0.z
    271 
        			// same transpose for corner 1
    272 			const __m128 r1X = _mm_unpacklo_ps( vertA1, vertC1 );	// vertA1.x, vertC1.x, vertA1.y, vertC1.y
    273 			const __m128 r1Y = _mm_unpackhi_ps( vertA1, vertC1 );	// vertA1.z, vertC1.z, vertA1.w, vertC1.w
    274 			const __m128 r1Z = _mm_unpacklo_ps( vertB1, vertD1 );	// vertB1.x, vertD1.x, vertB1.y, vertD1.y
    275 			const __m128 r1W = _mm_unpackhi_ps( vertB1, vertD1 );	// vertB1.z, vertD1.z, vertB1.w, vertD1.w
    276 
    277 			const __m128 vert1X = _mm_unpacklo_ps( r1X, r1Z );		// vertA1.x, vertB1.x, vertC1.x, vertD1.x
    278 			const __m128 vert1Y = _mm_unpackhi_ps( r1X, r1Z );		// vertA1.y, vertB1.y, vertC1.y, vertD1.y
    279 			const __m128 vert1Z = _mm_unpacklo_ps( r1Y, r1W );		// vertA1.z, vertB1.z, vertC1.z, vertD1.z
    280 
        			// same transpose for corner 2
    281 			const __m128 r2X = _mm_unpacklo_ps( vertA2, vertC2 );	// vertA2.x, vertC2.x, vertA2.y, vertC2.y
    282 			const __m128 r2Y = _mm_unpackhi_ps( vertA2, vertC2 );	// vertA2.z, vertC2.z, vertA2.w, vertC2.w
    283 			const __m128 r2Z = _mm_unpacklo_ps( vertB2, vertD2 );	// vertB2.x, vertD2.x, vertB2.y, vertD2.y
    284 			const __m128 r2W = _mm_unpackhi_ps( vertB2, vertD2 );	// vertB2.z, vertD2.z, vertB2.w, vertD2.w
    285 
    286 			const __m128 vert2X = _mm_unpacklo_ps( r2X, r2Z );		// vertA2.x, vertB2.x, vertC2.x, vertD2.x
    287 			const __m128 vert2Y = _mm_unpackhi_ps( r2X, r2Z );		// vertA2.y, vertB2.y, vertC2.y, vertD2.y
    288 			const __m128 vert2Z = _mm_unpacklo_ps( r2Y, r2W );		// vertA2.z, vertB2.z, vertC2.z, vertD2.z
    289 
    290 			const __m128i triangleCulled = TriangleCulled_SSE2( vert0X, vert0Y, vert0Z, vert1X, vert1Y, vert1Z, vert2X, vert2Y, vert2Z, lightProjectX, lightProjectY, lightProjectZ, lightProjectW );
    291 
    292 			__m128i triangleFacing = TriangleFacing_SSE2( vert0X, vert0Y, vert0Z, vert1X, vert1Y, vert1Z, vert2X, vert2Y, vert2Z, lightOriginX, lightOriginY, lightOriginZ );
    293 
    294 			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
    295 			triangleFacing = _mm_or_si128( triangleFacing, _mm_and_si128( triangleCulled, cullShadowTrianglesToLightMask ) );
    296 
    297 			// store culled
        			// two signed-saturating packs narrow each 32-bit lane mask to one byte (0xFF or 0x00);
        			// the four bytes are written at once through an int pointer -- assumes culled + j may be
        			// accessed that way, TODO confirm alignment of the caller's buffer
    298 			const __m128i culled_s = _mm_packs_epi32( triangleCulled, triangleCulled );
    299 			const __m128i culled_b = _mm_packs_epi16( culled_s, culled_s );
    300 			*(int *)&culled[j] = _mm_cvtsi128_si32( culled_b );
    301 
    302 			// store facing
    303 			const __m128i facing_s = _mm_packs_epi32( triangleFacing, triangleFacing );
    304 			const __m128i facing_b = _mm_packs_epi16( facing_s, facing_s );
    305 			*(int *)&facing[j] = _mm_cvtsi128_si32( facing_b );
    306 
    307 			// count the number of facing triangles
    308 			numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_and_si128( triangleFacing, vector_int_one ) );
    309 		}
    310 
        		// test the triangles of this batch that are not marked as facing against the view -> light line
    311 		if ( insideShadowVolume != NULL ) {
    312 			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
    313 				if ( !facing[n] ) {
        					// the corners are handed over in reverse order ( 2, 1, 0 )
    314 					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) {
    315 						*insideShadowVolume = true;
    316 						insideShadowVolume = NULL;	// one hit is enough; a NULL pointer skips this test for all later batches
    317 						break;
    318 					}
    319 				}
    320 			}
    321 		}
    322 	}
    323 
        	// horizontal add: first the upper and lower 64-bit halves, then the neighbouring lanes
    324 	numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_shuffle_epi32( numFrontFacing, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    325 	numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_shuffle_epi32( numFrontFacing, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    326 
    327 	return _mm_cvtsi128_si32( numFrontFacing );
    328 
    329 #else
    330 
    331 	idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
    332 
    333 	const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
    334 
    335 	int numFrontFacing = 0;
    336 
        	// i walks the indexes, j the triangles
    337 	for ( int i = 0, j = 0; i < numIndexes; ) {
    338 
    339 		const int batchStart = i;
    340 		const int batchEnd = indexedVertsODS.FetchNextBatch();
    341 		const int indexStart = j;
    342 
        		// one triangle per iteration
    343 		for ( ; i <= batchEnd - 3; i += 3, j++ ) {
    344 			const idVec3 & v1 = indexedVertsODS[i + 0].xyz;
    345 			const idVec3 & v2 = indexedVertsODS[i + 1].xyz;
    346 			const idVec3 & v3 = indexedVertsODS[i + 2].xyz;
    347 
    348 			const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
    349 
    350 			byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
    351 
    352 			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
    353 			triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
    354 
    355 			culled[j] = triangleCulled;
    356 			facing[j] = triangleFacing;
    357 
    358 			// count the number of facing triangles
    359 			numFrontFacing += ( triangleFacing & 1 );	// triangleFacing is 255 or 0, so this adds 1 or 0
    360 		}
    361 
        		// test the triangles of this batch that are not marked as facing against the view -> light line
    362 		if ( insideShadowVolume != NULL ) {
    363 			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
    364 				if ( !facing[n] ) {
        					// the corners are handed over in reverse order ( 2, 1, 0 )
    365 					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) {
    366 						*insideShadowVolume = true;
    367 						insideShadowVolume = NULL;	// one hit is enough; a NULL pointer skips this test for all later batches
    368 						break;
    369 					}
    370 				}
    371 			}
    372 		}
    373 	}
    374 
    375 	return numFrontFacing;
    376 
    377 #endif
    378 }
    379 
    380 /*
    381 =====================
    382 CalculateTriangleFacingCulledSkinned
        
        Skinned counterpart of CalculateTriangleFacingCulledStatic. First writes one
        skinned position per vertex to 'tempVerts', then, for every triangle, writes
        one byte to 'culled' and one byte to 'facing' (255 or 0) using those
        positions, and returns a count of the triangles whose facing byte is set.
        
        facing, culled       output arrays, indexed by triangle number
        tempVerts            output / scratch array with room for numVerts entries
        indexes, numIndexes  triangle list, three indexes per triangle
        verts, numVerts      source vertices
        joints               forwarded to the skinned position loader together with each vertex
        lightOrigin          point used for the facing test and as end of the test line
        viewOrigin           start of the test line
        cullShadowTrianglesToLight
                             when true, a culled triangle is also marked as facing
        lightProject         matrix used for the cull test
        insideShadowVolume   optional, may be NULL. Set to false on entry and to true
                             when R_LineIntersectsTriangleExpandedWithSphere reports a
                             hit between the line viewOrigin -> lightOrigin and any
                             triangle whose facing byte is zero.
        radius               forwarded to R_LineIntersectsTriangleExpandedWithSphere
        
        NOTE(review): the SSE2 path only handles complete groups of four triangles
        per batch and counts with vector_int_one, which is defined elsewhere. This
        presumably relies on FetchNextBatch rounding the batch end up to a multiple
        of 4 * 3 indexes (the last template argument) and on vector_int_one being 1
        in every lane -- confirm.
    383 =====================
    384 */
    385 static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte * __restrict culled, idVec4 * __restrict tempVerts, const triIndex_t * __restrict indexes, int numIndexes,
    386 												const idDrawVert * __restrict verts, const int numVerts, const idJointMat * __restrict joints,
    387 												const idVec3 & lightOrigin, const idVec3 & viewOrigin,
    388 												bool cullShadowTrianglesToLight, const idRenderMatrix & lightProject,
    389 												bool * insideShadowVolume, const float radius ) {
    390 	assert_spu_local_store( facing );
    391 	assert_spu_local_store( joints );
    392 	assert_not_spu_local_store( indexes );
    393 	assert_not_spu_local_store( verts );
    394 
    395 	if ( insideShadowVolume != NULL ) {
    396 		*insideShadowVolume = false;
    397 	}
    398 
    399 	// calculate the start, end, dir and length of the line from the view origin to the light origin
    400 	const idVec3 lineStart = viewOrigin;
    401 	const idVec3 lineEnd = lightOrigin;
    402 	const idVec3 lineDelta = lineEnd - lineStart;
    403 	const float lineLengthSqr = lineDelta.LengthSqr();
    404 	const float lineLengthRcp = idMath::InvSqrt( lineLengthSqr );
    405 	const idVec3 lineDir = lineDelta * lineLengthRcp;
    406 	const float lineLength = lineLengthSqr * lineLengthRcp;	// length obtained as length^2 * ( 1 / length )
    407 
    408 #ifdef ID_WIN_X86_SSE2_INTRIN
    409 
    410 	idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
    411 
        	// pass 1: skin every vertex once and keep the position in tempVerts
    412 	for ( int i = 0; i < numVerts; ) {
    413 
    414 		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;	// used as the last vertex index of this batch
    415 
    416 		for ( ; i <= nextNumVerts; i++ ) {
    417 			__m128 v = LoadSkinnedDrawVertPosition( vertsODS[i], joints );
    418 			_mm_store_ps( tempVerts[i].ToFloatPtr(), v );	// aligned 16-byte store
    419 		}
    420 	}
    421 
    422 	idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 4 * 3 > indexesODS( indexes, numIndexes );
    423 
        	// light origin coordinates, each copied to all four lanes
    424 	const __m128 lightOriginX = _mm_splat_ps( _mm_load_ss( &lightOrigin.x ), 0 );
    425 	const __m128 lightOriginY = _mm_splat_ps( _mm_load_ss( &lightOrigin.y ), 0 );
    426 	const __m128 lightOriginZ = _mm_splat_ps( _mm_load_ss( &lightOrigin.z ), 0 );
    427 
        	// four floats from each of lightProject[0..3], loaded without an alignment requirement
    428 	const __m128 lightProjectX = _mm_loadu_ps( lightProject[0] );
    429 	const __m128 lightProjectY = _mm_loadu_ps( lightProject[1] );
    430 	const __m128 lightProjectZ = _mm_loadu_ps( lightProject[2] );
    431 	const __m128 lightProjectW = _mm_loadu_ps( lightProject[3] );
    432 
    433 	const __m128i cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? vector_int_neg_one : vector_int_zero;
    434 
    435 	__m128i numFrontFacing = _mm_setzero_si128();	// four partial counters, summed after the loop
    436 
        	// pass 2: i walks the indexes, j the triangles
    437 	for ( int i = 0, j = 0; i < numIndexes; ) {
    438 
    439 		const int batchStart = i;
    440 		const int batchEnd = indexesODS.FetchNextBatch();
    441 		const int batchEnd4x = batchEnd - 4 * 3;	// last start index that still leaves four whole triangles
    442 		const int indexStart = j;
    443 
        		// four triangles (A, B, C, D) per iteration
    444 		for ( ; i <= batchEnd4x; i += 4 * 3, j += 4 ) {
    445 			const int indexA0 = indexesODS[( i + 0 * 3 + 0 )];
    446 			const int indexA1 = indexesODS[( i + 0 * 3 + 1 )];
    447 			const int indexA2 = indexesODS[( i + 0 * 3 + 2 )];
    448 
    449 			const int indexB0 = indexesODS[( i + 1 * 3 + 0 )];
    450 			const int indexB1 = indexesODS[( i + 1 * 3 + 1 )];
    451 			const int indexB2 = indexesODS[( i + 1 * 3 + 2 )];
    452 
    453 			const int indexC0 = indexesODS[( i + 2 * 3 + 0 )];
    454 			const int indexC1 = indexesODS[( i + 2 * 3 + 1 )];
    455 			const int indexC2 = indexesODS[( i + 2 * 3 + 2 )];
    456 
    457 			const int indexD0 = indexesODS[( i + 3 * 3 + 0 )];
    458 			const int indexD1 = indexesODS[( i + 3 * 3 + 1 )];
    459 			const int indexD2 = indexesODS[( i + 3 * 3 + 2 )];
    460 
        			// fetch the skinned positions written by pass 1 (aligned 16-byte loads)
    461 			const __m128 vertA0 = _mm_load_ps( tempVerts[indexA0].ToFloatPtr() );
    462 			const __m128 vertA1 = _mm_load_ps( tempVerts[indexA1].ToFloatPtr() );
    463 			const __m128 vertA2 = _mm_load_ps( tempVerts[indexA2].ToFloatPtr() );
    464 
    465 			const __m128 vertB0 = _mm_load_ps( tempVerts[indexB0].ToFloatPtr() );
    466 			const __m128 vertB1 = _mm_load_ps( tempVerts[indexB1].ToFloatPtr() );
    467 			const __m128 vertB2 = _mm_load_ps( tempVerts[indexB2].ToFloatPtr() );
    468 
    469 			const __m128 vertC0 = _mm_load_ps( tempVerts[indexC0].ToFloatPtr() );
    470 			const __m128 vertC1 = _mm_load_ps( tempVerts[indexC1].ToFloatPtr() );
    471 			const __m128 vertC2 = _mm_load_ps( tempVerts[indexC2].ToFloatPtr() );
    472 
    473 			const __m128 vertD0 = _mm_load_ps( tempVerts[indexD0].ToFloatPtr() );
    474 			const __m128 vertD1 = _mm_load_ps( tempVerts[indexD1].ToFloatPtr() );
    475 			const __m128 vertD2 = _mm_load_ps( tempVerts[indexD2].ToFloatPtr() );
    476 
        			// transpose corner 0 of the four triangles into one register per coordinate
    477 			const __m128 r0X = _mm_unpacklo_ps( vertA0, vertC0 );	// vertA0.x, vertC0.x, vertA0.y, vertC0.y
    478 			const __m128 r0Y = _mm_unpackhi_ps( vertA0, vertC0 );	// vertA0.z, vertC0.z, vertA0.w, vertC0.w
    479 			const __m128 r0Z = _mm_unpacklo_ps( vertB0, vertD0 );	// vertB0.x, vertD0.x, vertB0.y, vertD0.y
    480 			const __m128 r0W = _mm_unpackhi_ps( vertB0, vertD0 );	// vertB0.z, vertD0.z, vertB0.w, vertD0.w
    481 
    482 			const __m128 vert0X = _mm_unpacklo_ps( r0X, r0Z );		// vertA0.x, vertB0.x, vertC0.x, vertD0.x
    483 			const __m128 vert0Y = _mm_unpackhi_ps( r0X, r0Z );		// vertA0.y, vertB0.y, vertC0.y, vertD0.y
    484 			const __m128 vert0Z = _mm_unpacklo_ps( r0Y, r0W );		// vertA0.z, vertB0.z, vertC0.z, vertD0.z
    485 
        			// same transpose for corner 1
    486 			const __m128 r1X = _mm_unpacklo_ps( vertA1, vertC1 );	// vertA1.x, vertC1.x, vertA1.y, vertC1.y
    487 			const __m128 r1Y = _mm_unpackhi_ps( vertA1, vertC1 );	// vertA1.z, vertC1.z, vertA1.w, vertC1.w
    488 			const __m128 r1Z = _mm_unpacklo_ps( vertB1, vertD1 );	// vertB1.x, vertD1.x, vertB1.y, vertD1.y
    489 			const __m128 r1W = _mm_unpackhi_ps( vertB1, vertD1 );	// vertB1.z, vertD1.z, vertB1.w, vertD1.w
    490 
    491 			const __m128 vert1X = _mm_unpacklo_ps( r1X, r1Z );		// vertA1.x, vertB1.x, vertC1.x, vertD1.x
    492 			const __m128 vert1Y = _mm_unpackhi_ps( r1X, r1Z );		// vertA1.y, vertB1.y, vertC1.y, vertD1.y
    493 			const __m128 vert1Z = _mm_unpacklo_ps( r1Y, r1W );		// vertA1.z, vertB1.z, vertC1.z, vertD1.z
    494 
        			// same transpose for corner 2
    495 			const __m128 r2X = _mm_unpacklo_ps( vertA2, vertC2 );	// vertA2.x, vertC2.x, vertA2.y, vertC2.y
    496 			const __m128 r2Y = _mm_unpackhi_ps( vertA2, vertC2 );	// vertA2.z, vertC2.z, vertA2.w, vertC2.w
    497 			const __m128 r2Z = _mm_unpacklo_ps( vertB2, vertD2 );	// vertB2.x, vertD2.x, vertB2.y, vertD2.y
    498 			const __m128 r2W = _mm_unpackhi_ps( vertB2, vertD2 );	// vertB2.z, vertD2.z, vertB2.w, vertD2.w
    499 
    500 			const __m128 vert2X = _mm_unpacklo_ps( r2X, r2Z );		// vertA2.x, vertB2.x, vertC2.x, vertD2.x
    501 			const __m128 vert2Y = _mm_unpackhi_ps( r2X, r2Z );		// vertA2.y, vertB2.y, vertC2.y, vertD2.y
    502 			const __m128 vert2Z = _mm_unpacklo_ps( r2Y, r2W );		// vertA2.z, vertB2.z, vertC2.z, vertD2.z
    503 
    504 			const __m128i triangleCulled = TriangleCulled_SSE2( vert0X, vert0Y, vert0Z, vert1X, vert1Y, vert1Z, vert2X, vert2Y, vert2Z, lightProjectX, lightProjectY, lightProjectZ, lightProjectW );
    505 
    506 			__m128i triangleFacing = TriangleFacing_SSE2( vert0X, vert0Y, vert0Z, vert1X, vert1Y, vert1Z, vert2X, vert2Y, vert2Z, lightOriginX, lightOriginY, lightOriginZ );
    507 
    508 			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
    509 			triangleFacing = _mm_or_si128( triangleFacing, _mm_and_si128( triangleCulled, cullShadowTrianglesToLightMask ) );
    510 
    511 			// store culled
        			// two signed-saturating packs narrow each 32-bit lane mask to one byte (0xFF or 0x00);
        			// the four bytes are written at once through an int pointer -- assumes culled + j may be
        			// accessed that way, TODO confirm alignment of the caller's buffer
    512 			const __m128i culled_s = _mm_packs_epi32( triangleCulled, triangleCulled );
    513 			const __m128i culled_b = _mm_packs_epi16( culled_s, culled_s );
    514 			*(int *)&culled[j] = _mm_cvtsi128_si32( culled_b );
    515 
    516 			// store facing
    517 			const __m128i facing_s = _mm_packs_epi32( triangleFacing, triangleFacing );
    518 			const __m128i facing_b = _mm_packs_epi16( facing_s, facing_s );
    519 			*(int *)&facing[j] = _mm_cvtsi128_si32( facing_b );
    520 
    521 			// count the number of facing triangles
    522 			numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_and_si128( triangleFacing, vector_int_one ) );
    523 		}
    524 	
        		// test the triangles of this batch that are not marked as facing against the view -> light line
    525 		if ( insideShadowVolume != NULL ) {
    526 			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
    527 				if ( !facing[n] ) {
    528 					const int i0 = indexesODS[k + 0];
    529 					const int i1 = indexesODS[k + 1];
    530 					const int i2 = indexesODS[k + 2];
        					// the corners are handed over in reverse order ( 2, 1, 0 )
    531 					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) {
    532 						*insideShadowVolume = true;
    533 						insideShadowVolume = NULL;	// one hit is enough; a NULL pointer skips this test for all later batches
    534 						break;
    535 					}
    536 				}
    537 			}
    538 		}
    539 	}
    540 
        	// horizontal add: first the upper and lower 64-bit halves, then the neighbouring lanes
    541 	numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_shuffle_epi32( numFrontFacing, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    542 	numFrontFacing = _mm_add_epi32( numFrontFacing, _mm_shuffle_epi32( numFrontFacing, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    543 
    544 	return _mm_cvtsi128_si32( numFrontFacing );
    545 
    546 #else
    547 
    548 	idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
    549 
        	// pass 1: skin every vertex once and keep the position in tempVerts
    550 	for ( int i = 0; i < numVerts; ) {
    551 
    552 		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;	// used as the last vertex index of this batch
    553 
    554 		for ( ; i <= nextNumVerts; i++ ) {
    555 			tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
    556 			tempVerts[i].w = 1.0f;
    557 		}
    558 	}
    559 
    560 	idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
    561 
    562 	const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
    563 
    564 	int numFrontFacing = 0;
    565 
        	// pass 2: i walks the indexes, j the triangles
    566 	for ( int i = 0, j = 0; i < numIndexes; ) {
    567 
    568 		const int batchStart = i;
    569 		const int batchEnd = indexesODS.FetchNextBatch();
    570 		const int indexStart = j;
    571 
        		// one triangle per iteration
    572 		for ( ; i <= batchEnd - 3; i += 3, j++ ) {
    573 			const int i0 = indexesODS[i + 0];
    574 			const int i1 = indexesODS[i + 1];
    575 			const int i2 = indexesODS[i + 2];
    576 
    577 			const idVec3 & v1 = tempVerts[i0].ToVec3();
    578 			const idVec3 & v2 = tempVerts[i1].ToVec3();
    579 			const idVec3 & v3 = tempVerts[i2].ToVec3();
    580 
    581 			const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
    582 
    583 			byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
    584 
    585 			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
    586 			triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
    587 
    588 			culled[j] = triangleCulled;
    589 			facing[j] = triangleFacing;
    590 
    591 			// count the number of facing triangles
    592 			numFrontFacing += ( triangleFacing & 1 );	// triangleFacing is 255 or 0, so this adds 1 or 0
    593 		}
    594 
        		// test the triangles of this batch that are not marked as facing against the view -> light line
    595 		if ( insideShadowVolume != NULL ) {
    596 			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
    597 				if ( !facing[n] ) {
    598 					const int i0 = indexesODS[k + 0];
    599 					const int i1 = indexesODS[k + 1];
    600 					const int i2 = indexesODS[k + 2];
        					// the corners are handed over in reverse order ( 2, 1, 0 )
    601 					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) {
    602 						*insideShadowVolume = true;
    603 						insideShadowVolume = NULL;	// one hit is enough; a NULL pointer skips this test for all later batches
    604 						break;
    605 					}
    606 				}
    607 			}
    608 		}
    609 	}
    610 
    611 	return numFrontFacing;
    612 
    613 #endif
    614 }
    615 
    616 /*
    617 ============
    618 StreamOut
    619 ============
    620 */
    621 static void StreamOut( void * dst, const void * src, int numBytes ) {
    622 	numBytes = ( numBytes + 15 ) & ~15;
    623 	assert_16_byte_aligned( dst );
    624 	assert_16_byte_aligned( src );
    625 
    626 #ifdef ID_WIN_X86_SSE2_INTRIN
    627 	int i = 0;
    628 	for ( ; i + 128 <= numBytes; i += 128 ) {
    629 		__m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) );
    630 		__m128i d1 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 1*16 ) );
    631 		__m128i d2 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 2*16 ) );
    632 		__m128i d3 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 3*16 ) );
    633 		__m128i d4 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 4*16 ) );
    634 		__m128i d5 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 5*16 ) );
    635 		__m128i d6 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 6*16 ) );
    636 		__m128i d7 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 7*16 ) );
    637 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 0*16 ), d0 );
    638 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 1*16 ), d1 );
    639 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 2*16 ), d2 );
    640 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 3*16 ), d3 );
    641 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 4*16 ), d4 );
    642 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 5*16 ), d5 );
    643 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 6*16 ), d6 );
    644 		_mm_stream_si128( (__m128i *)( (byte *)dst + i + 7*16 ), d7 );
    645 	}
    646 	for ( ; i + 16 <= numBytes; i += 16 ) {
    647 		__m128i d = _mm_load_si128( (__m128i *)( (byte *)src + i ) );
    648 		_mm_stream_si128( (__m128i *)( (byte *)dst + i ), d );
    649 	}
    650 #else
    651 	memcpy( dst, src, numBytes );
    652 #endif
    653 }
    654 
    655 /*
    656 ============
    657 R_CreateShadowVolumeTriangles
    658 ============
    659 */
    660 static void R_CreateShadowVolumeTriangles( triIndex_t *__restrict shadowIndices, triIndex_t *__restrict indexBuffer, int & numShadowIndexesTotal,
    661 												const byte *__restrict facing, const silEdge_t *__restrict silEdges, const int numSilEdges,
    662 												const triIndex_t *__restrict indexes, const int numIndexes, const bool includeCaps ) {
    663 	assert_spu_local_store( facing );
    664 	assert_not_spu_local_store( shadowIndices );
    665 	assert_not_spu_local_store( silEdges );
    666 	assert_not_spu_local_store( indexes );
    667 
    668 #if 1
    669 
    670 	const int IN_BUFFER_SIZE = 64;
    671 	const int OUT_BUFFER_SIZE = IN_BUFFER_SIZE * 8;			// each silhouette edge or cap triangle may create 6 indices (8 > 6)
    672 	const int OUT_BUFFER_DEPTH = 4;							// quad buffer to allow overlapped output streaming
    673 	const int OUT_BUFFER_MASK = ( OUT_BUFFER_SIZE * OUT_BUFFER_DEPTH - 1 );
    674 
    675 	compile_time_assert( OUT_BUFFER_SIZE * OUT_BUFFER_DEPTH * sizeof( triIndex_t ) == OUTPUT_INDEX_BUFFER_SIZE );
    676 	assert_16_byte_aligned( indexBuffer );
    677 
    678 	int numShadowIndices = 0;
    679 	int numStreamedIndices = 0;
    680 
    681 	{
    682 		idODSStreamedArray< silEdge_t, IN_BUFFER_SIZE, SBT_DOUBLE, 1 > silEdgesODS( silEdges, numSilEdges );
    683 
    684 		for ( int i = 0; i < numSilEdges; ) {
    685 
    686 			const int nextNumSilEdges = silEdgesODS.FetchNextBatch();
    687 
    688 			// NOTE: we rely on FetchNextBatch() to wait for all previous DMAs to complete
    689 			while( numShadowIndices - numStreamedIndices >= OUT_BUFFER_SIZE ) {
    690 				StreamOut( shadowIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], OUT_BUFFER_SIZE * sizeof( triIndex_t ) );
    691 				numStreamedIndices += OUT_BUFFER_SIZE;
    692 			}
    693 
    694 			for ( ; i + 4 <= nextNumSilEdges; i += 4 ) {
    695 				const silEdge_t & sil0 = silEdgesODS[i + 0];
    696 				const silEdge_t & sil1 = silEdgesODS[i + 1];
    697 				const silEdge_t & sil2 = silEdgesODS[i + 2];
    698 				const silEdge_t & sil3 = silEdgesODS[i + 3];
    699 
    700 				{
    701 					const byte f1a = facing[sil0.p1];
    702 					const byte f2a = facing[sil0.p2];
    703 					const byte ta = ( f1a ^ f2a ) & 6;
    704 					const triIndex_t v1a = sil0.v1 << 1;
    705 					const triIndex_t v2a = sil0.v2 << 1;
    706 
    707 					WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], v1a ^ (   0 & 1 ), v2a ^ ( f1a & 1 ) );
    708 					WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], v2a ^ ( f2a & 1 ), v1a ^ ( f2a & 1 ) );
    709 					WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], v1a ^ ( f1a & 1 ), v2a ^ (   1 & 1 ) );
    710 
    711 					numShadowIndices += ta;
    712 				}
    713 
    714 				{
    715 					const byte f1b = facing[sil1.p1];
    716 					const byte f2b = facing[sil1.p2];
    717 					const byte tb = ( f1b ^ f2b ) & 6;
    718 					const triIndex_t v1b = sil1.v1 << 1;
    719 					const triIndex_t v2b = sil1.v2 << 1;
    720 
    721 					WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], v1b ^ (   0 & 1 ), v2b ^ ( f1b & 1 ) );
    722 					WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], v2b ^ ( f2b & 1 ), v1b ^ ( f2b & 1 ) );
    723 					WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], v1b ^ ( f1b & 1 ), v2b ^ (   1 & 1 ) );
    724 
    725 					numShadowIndices += tb;
    726 				}
    727 
    728 				{
    729 					const byte f1c = facing[sil2.p1];
    730 					const byte f2c = facing[sil2.p2];
    731 					const byte tc = ( f1c ^ f2c ) & 6;
    732 					const triIndex_t v1c = sil2.v1 << 1;
    733 					const triIndex_t v2c = sil2.v2 << 1;
    734 
    735 					WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], v1c ^ (   0 & 1 ), v2c ^ ( f1c & 1 ) );
    736 					WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], v2c ^ ( f2c & 1 ), v1c ^ ( f2c & 1 ) );
    737 					WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], v1c ^ ( f1c & 1 ), v2c ^ (   1 & 1 ) );
    738 
    739 					numShadowIndices += tc;
    740 				}
    741 
    742 				{
    743 					const byte f1d = facing[sil3.p1];
    744 					const byte f2d = facing[sil3.p2];
    745 					const byte td = ( f1d ^ f2d ) & 6;
    746 					const triIndex_t v1d = sil3.v1 << 1;
    747 					const triIndex_t v2d = sil3.v2 << 1;
    748 
    749 					WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], v1d ^ (   0 & 1 ), v2d ^ ( f1d & 1 ) );
    750 					WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], v2d ^ ( f2d & 1 ), v1d ^ ( f2d & 1 ) );
    751 					WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], v1d ^ ( f1d & 1 ), v2d ^ (   1 & 1 ) );
    752 
    753 					numShadowIndices += td;
    754 				}
    755 			}
    756 			for ( ; i + 1 <= nextNumSilEdges; i++ ) {
    757 				const silEdge_t & sil = silEdgesODS[i];
    758 
    759 				const byte f1 = facing[sil.p1];
    760 				const byte f2 = facing[sil.p2];
    761 				const byte t = ( f1 ^ f2 ) & 6;
    762 				const triIndex_t v1 = sil.v1 << 1;
    763 				const triIndex_t v2 = sil.v2 << 1;
    764 
    765 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], v1 ^ (  0 & 1 ), v2 ^ ( f1 & 1 ) );
    766 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], v2 ^ ( f2 & 1 ), v1 ^ ( f2 & 1 ) );
    767 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], v1 ^ ( f1 & 1 ), v2 ^ (  1 & 1 ) );
    768 
    769 				numShadowIndices += t;
    770 			}
    771 		}
    772 	}
    773 
    774 	if ( includeCaps ) {
    775 		idODSStreamedArray< triIndex_t, IN_BUFFER_SIZE, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
    776 
    777 		for ( int i = 0, j = 0; i < numIndexes; ) {
    778 
    779 			const int nextNumIndexes = indexesODS.FetchNextBatch();
    780 
    781 			// NOTE: we rely on FetchNextBatch() to wait for all previous DMAs to complete
    782 			while( numShadowIndices - numStreamedIndices >= OUT_BUFFER_SIZE ) {
    783 				StreamOut( shadowIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], OUT_BUFFER_SIZE * sizeof( triIndex_t ) );
    784 				numStreamedIndices += OUT_BUFFER_SIZE;
    785 			}
    786 
    787 			for ( ; i + 4 * 3 <= nextNumIndexes; i += 4 * 3, j += 4 ) {
    788 				const byte ta = ~facing[j + 0] & 6;
    789 				const byte tb = ~facing[j + 1] & 6;
    790 				const byte tc = ~facing[j + 2] & 6;
    791 				const byte td = ~facing[j + 3] & 6;
    792 
    793 				const triIndex_t i0a = indexesODS[i + 0 * 3 + 0] << 1;
    794 				const triIndex_t i1a = indexesODS[i + 0 * 3 + 1] << 1;
    795 				const triIndex_t i2a = indexesODS[i + 0 * 3 + 2] << 1;
    796 
    797 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], i2a + 0, i1a + 0 );
    798 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], i0a + 0, i0a + 1 );
    799 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], i1a + 1, i2a + 1 );
    800 
    801 				numShadowIndices += ta;
    802 
    803 				const triIndex_t i0b = indexesODS[i + 1 * 3 + 0] << 1;
    804 				const triIndex_t i1b = indexesODS[i + 1 * 3 + 1] << 1;
    805 				const triIndex_t i2b = indexesODS[i + 1 * 3 + 2] << 1;
    806 
    807 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], i2b + 0, i1b + 0 );
    808 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], i0b + 0, i0b + 1 );
    809 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], i1b + 1, i2b + 1 );
    810 
    811 				numShadowIndices += tb;
    812 
    813 				const triIndex_t i0c = indexesODS[i + 2 * 3 + 0] << 1;
    814 				const triIndex_t i1c = indexesODS[i + 2 * 3 + 1] << 1;
    815 				const triIndex_t i2c = indexesODS[i + 2 * 3 + 2] << 1;
    816 
    817 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], i2c + 0, i1c + 0 );
    818 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], i0c + 0, i0c + 1 );
    819 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], i1c + 1, i2c + 1 );
    820 
    821 				numShadowIndices += tc;
    822 
    823 				const triIndex_t i0d = indexesODS[i + 3 * 3 + 0] << 1;
    824 				const triIndex_t i1d = indexesODS[i + 3 * 3 + 1] << 1;
    825 				const triIndex_t i2d = indexesODS[i + 3 * 3 + 2] << 1;
    826 
    827 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], i2d + 0, i1d + 0 );
    828 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], i0d + 0, i0d + 1 );
    829 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], i1d + 1, i2d + 1 );
    830 
    831 				numShadowIndices += td;
    832 			}
    833 
    834 			for ( ; i + 3 <= nextNumIndexes; i += 3, j++ ) {
    835 				const byte t = ~facing[j] & 6;
    836 
    837 				const triIndex_t i0 = indexesODS[i + 0] << 1;
    838 				const triIndex_t i1 = indexesODS[i + 1] << 1;
    839 				const triIndex_t i2 = indexesODS[i + 2] << 1;
    840 
    841 				WriteIndexPair( &indexBuffer[( numShadowIndices + 0 ) & OUT_BUFFER_MASK], i2 + 0, i1 + 0 );
    842 				WriteIndexPair( &indexBuffer[( numShadowIndices + 2 ) & OUT_BUFFER_MASK], i0 + 0, i0 + 1 );
    843 				WriteIndexPair( &indexBuffer[( numShadowIndices + 4 ) & OUT_BUFFER_MASK], i1 + 1, i2 + 1 );
    844 
    845 				numShadowIndices += t;
    846 			}
    847 		}
    848 	}
    849 
    850 	while( numShadowIndices - numStreamedIndices >= OUT_BUFFER_SIZE ) {
    851 		StreamOut( shadowIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], OUT_BUFFER_SIZE * sizeof( triIndex_t ) );
    852 		numStreamedIndices += OUT_BUFFER_SIZE;
    853 	}
    854 	if ( numShadowIndices > numStreamedIndices ) {
    855 		assert( numShadowIndices - numStreamedIndices < OUT_BUFFER_SIZE );
    856 		StreamOut( shadowIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], ( numShadowIndices - numStreamedIndices ) * sizeof( triIndex_t ) );
    857 	}
    858 
    859 	numShadowIndexesTotal = numShadowIndices;
    860 
    861 #if defined( ID_WIN_X86_SSE2_INTRIN )
    862 	_mm_sfence();
    863 #endif
    864 
    865 #else	// NOTE: this code will not work on the SPU because it tries to write directly to the destination
    866 
    867 	triIndex_t * shadowIndexPtr = shadowIndices;
    868 
    869 	{
    870 		idODSStreamedArray< silEdge_t, 128, SBT_DOUBLE, 1 > silEdgesODS( silEdges, numSilEdges );
    871 
    872 		for ( int i = 0; i < numSilEdges; ) {
    873 
    874 			const int nextNumSilEdges = silEdgesODS.FetchNextBatch() - 1;
    875 
    876 			for ( ; i <= nextNumSilEdges; i++ ) {
    877 				const silEdge_t & sil = silEdgesODS[i];
    878 
    879 				const byte f1 = facing[sil.p1] & 1;
    880 				const byte f2 = facing[sil.p2] & 1;
    881 
    882 				if ( ( f1 ^ f2 ) == 0 ) {
    883 					continue;
    884 				}
    885 
    886 				const triIndex_t v1 = sil.v1 << 1;
    887 				const triIndex_t v2 = sil.v2 << 1;
    888 
    889 				// set the two triangle winding orders based on facing
    890 				// without using a poorly-predictable branch
    891 #if 1
    892 				// only write dwords to write combined memory
    893 				WriteIndexPair( shadowIndexPtr + 0, v1 ^  0, v2 ^ f1 );
    894 				WriteIndexPair( shadowIndexPtr + 2, v2 ^ f2, v1 ^ f2 );
    895 				WriteIndexPair( shadowIndexPtr + 4, v1 ^ f1, v2 ^  1 );
    896 #else
    897 				shadowIndexPtr[0] == v1;
    898 				shadowIndexPtr[1] == v2 ^ f1;
    899 				shadowIndexPtr[2] == v2 ^ f2;
    900 				shadowIndexPtr[3] == v1 ^ f2;
    901 				shadowIndexPtr[4] == v1 ^ f1;
    902 				shadowIndexPtr[5] == v2 ^ 1;
    903 #endif
    904 				shadowIndexPtr += 6;
    905 			}
    906 		}
    907 	}
    908 
    909 	if ( includeCaps ) {
    910 		idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
    911 
    912 		for ( int i = 0, j = 0; i < numIndexes; ) {
    913 
    914 			const int nextNumIndexes = indexesODS.FetchNextBatch() - 3;
    915 
    916 			for ( ; i <= nextNumIndexes; i += 3, j++ ) {
    917 				if ( facing[j] ) {
    918 					continue;
    919 				}
    920 
    921 				const triIndex_t i0 = indexesODS[i + 0] << 1;
    922 				const triIndex_t i1 = indexesODS[i + 1] << 1;
    923 				const triIndex_t i2 = indexesODS[i + 2] << 1;
    924 #if 1
    925 				// only write dwords to write combined memory
    926 				WriteIndexPair( shadowIndexPtr + 0, i2 + 0, i1 + 0 );
    927 				WriteIndexPair( shadowIndexPtr + 2, i0 + 0, i0 + 1 );
    928 				WriteIndexPair( shadowIndexPtr + 4, i1 + 1, i2 + 1 );
    929 #else
    930 				shadowIndexPtr[0] = i2;
    931 				shadowIndexPtr[1] = i1;
    932 				shadowIndexPtr[2] = i0;
    933 				shadowIndexPtr[3] = i0 + 1;
    934 				shadowIndexPtr[4] = i1 + 1;
    935 				shadowIndexPtr[5] = i2 + 1;
    936 #endif
    937 				shadowIndexPtr += 6;
    938 			}
    939 		}
    940 	}
    941 
    942 	numShadowIndexesTotal = shadowIndexPtr - shadowIndices;
    943 
    944 #endif
    945 }
    946 
    947 /*
    948 =====================
    949 R_CreateLightTriangles
    950 =====================
    951 */
    952 void R_CreateLightTriangles( triIndex_t * __restrict lightIndices, triIndex_t * __restrict indexBuffer, int & numLightIndicesTotal,
    953 								const byte * __restrict culled, const triIndex_t * __restrict indexes, const int numIndexes ) {
    954 	assert_spu_local_store( culled );
    955 	assert_not_spu_local_store( lightIndices );
    956 	assert_not_spu_local_store( indexes );
    957 
    958 #if 1
    959 
    960 	const int IN_BUFFER_SIZE = 256;
    961 	const int OUT_BUFFER_SIZE = IN_BUFFER_SIZE * 2;			// there are never more indices generated than the original indices
    962 	const int OUT_BUFFER_DEPTH = 4;							// quad buffer to allow overlapped output streaming
    963 	const int OUT_BUFFER_MASK = ( OUT_BUFFER_SIZE * OUT_BUFFER_DEPTH - 1 );
    964 
    965 	compile_time_assert( OUT_BUFFER_SIZE * OUT_BUFFER_DEPTH * sizeof( triIndex_t ) == OUTPUT_INDEX_BUFFER_SIZE );
    966 	assert_16_byte_aligned( indexBuffer );
    967 
    968 	int numLightIndices = 0;
    969 	int numStreamedIndices = 0;
    970 
    971 	idODSStreamedArray< triIndex_t, IN_BUFFER_SIZE, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
    972 
    973 	for ( int i = 0, j = 0; i < numIndexes; ) {
    974 
    975 		const int nextNumIndexes = indexesODS.FetchNextBatch();
    976 
    977 		// NOTE: we rely on FetchNextBatch() to wait for all previous DMAs to complete
    978 		while( numLightIndices - numStreamedIndices >= OUT_BUFFER_SIZE ) {
    979 			StreamOut( lightIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], OUT_BUFFER_SIZE * sizeof( triIndex_t ) );
    980 			numStreamedIndices += OUT_BUFFER_SIZE;
    981 		}
    982 
    983 		for ( ; i + 4 * 3 <= nextNumIndexes; i += 4 * 3, j += 4 ) {
    984 			const byte ta = ~culled[j + 0] & 3;
    985 			const byte tb = ~culled[j + 1] & 3;
    986 			const byte tc = ~culled[j + 2] & 3;
    987 			const byte td = ~culled[j + 3] & 3;
    988 
    989 			indexBuffer[( numLightIndices + 0 ) & OUT_BUFFER_MASK] = indexesODS[i + 0 * 3 + 0];
    990 			indexBuffer[( numLightIndices + 1 ) & OUT_BUFFER_MASK] = indexesODS[i + 0 * 3 + 1];
    991 			indexBuffer[( numLightIndices + 2 ) & OUT_BUFFER_MASK] = indexesODS[i + 0 * 3 + 2];
    992 
    993 			numLightIndices += ta;
    994 
    995 			indexBuffer[( numLightIndices + 0 ) & OUT_BUFFER_MASK] = indexesODS[i + 1 * 3 + 0];
    996 			indexBuffer[( numLightIndices + 1 ) & OUT_BUFFER_MASK] = indexesODS[i + 1 * 3 + 1];
    997 			indexBuffer[( numLightIndices + 2 ) & OUT_BUFFER_MASK] = indexesODS[i + 1 * 3 + 2];
    998 
    999 			numLightIndices += tb;
   1000 
   1001 			indexBuffer[( numLightIndices + 0 ) & OUT_BUFFER_MASK] = indexesODS[i + 2 * 3 + 0];
   1002 			indexBuffer[( numLightIndices + 1 ) & OUT_BUFFER_MASK] = indexesODS[i + 2 * 3 + 1];
   1003 			indexBuffer[( numLightIndices + 2 ) & OUT_BUFFER_MASK] = indexesODS[i + 2 * 3 + 2];
   1004 
   1005 			numLightIndices += tc;
   1006 
   1007 			indexBuffer[( numLightIndices + 0 ) & OUT_BUFFER_MASK] = indexesODS[i + 3 * 3 + 0];
   1008 			indexBuffer[( numLightIndices + 1 ) & OUT_BUFFER_MASK] = indexesODS[i + 3 * 3 + 1];
   1009 			indexBuffer[( numLightIndices + 2 ) & OUT_BUFFER_MASK] = indexesODS[i + 3 * 3 + 2];
   1010 
   1011 			numLightIndices += td;
   1012 		}
   1013 
   1014 		for ( ; i + 3 <= nextNumIndexes; i += 3, j++ ) {
   1015 			const byte t = ~culled[j] & 3;
   1016 
   1017 			indexBuffer[( numLightIndices + 0 ) & OUT_BUFFER_MASK] = indexesODS[i + 0];
   1018 			indexBuffer[( numLightIndices + 1 ) & OUT_BUFFER_MASK] = indexesODS[i + 1];
   1019 			indexBuffer[( numLightIndices + 2 ) & OUT_BUFFER_MASK] = indexesODS[i + 2];
   1020 
   1021 			numLightIndices += t;
   1022 		}
   1023 	}
   1024 
   1025 	while( numLightIndices - numStreamedIndices >= OUT_BUFFER_SIZE ) {
   1026 		StreamOut( lightIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], OUT_BUFFER_SIZE * sizeof( triIndex_t ) );
   1027 		numStreamedIndices += OUT_BUFFER_SIZE;
   1028 	}
   1029 	if ( numLightIndices > numStreamedIndices ) {
   1030 		assert( numLightIndices - numStreamedIndices < OUT_BUFFER_SIZE );
   1031 		StreamOut( lightIndices + numStreamedIndices, & indexBuffer[numStreamedIndices & OUT_BUFFER_MASK], ( numLightIndices - numStreamedIndices ) * sizeof( triIndex_t ) );
   1032 	}
   1033 
   1034 	numLightIndicesTotal = numLightIndices;
   1035 
   1036 #if defined( ID_WIN_X86_SSE2_INTRIN )
   1037 	_mm_sfence();
   1038 #endif
   1039 
   1040 #else	// NOTE: this code will not work on the SPU because it tries to write directly to the destination
   1041 
   1042 	int numLightIndices = 0;
   1043 
   1044 	idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
   1045 
   1046 	for ( int i = 0, j = 0; i < numIndexes; ) {
   1047 
   1048 		const int nextNumIndexes = indexesODS.FetchNextBatch() - 3;
   1049 
   1050 		for ( ; i <= nextNumIndexes; i += 3, j++ ) {
   1051 			if ( culled[j] ) {
   1052 				continue;
   1053 			}
   1054 
   1055 			lightIndices[numLightIndices + 0] = indexesODS[i + 0];
   1056 			lightIndices[numLightIndices + 1] = indexesODS[i + 1];
   1057 			lightIndices[numLightIndices + 2] = indexesODS[i + 2];
   1058 
   1059 			numLightIndices += 3;
   1060 		}
   1061 	}
   1062 
   1063 	numLightIndicesTotal = numLightIndices;
   1064 
   1065 #endif
   1066 }
   1067 
   1068 /*
   1069 =====================
   1070 DynamicShadowVolumeJob
   1071 
   1072 Creates shadow volume indices for a surface that intersects a light.
   1073 Optionally also creates new surface indices with just the triangles
   1074 inside the light volume. These indices will be unique for a given
   1075 light / surface combination.
   1076 
   1077 The shadow volume indices are created using the original surface vertices.
   1078 However, the indices are setup to be used with a shadow volume vertex buffer
   1079 with all vertices duplicated where the even vertices have the same positions
   1080 as the surface vertices (at the near cap) and each odd vertex has the
   1081 same position as the previous even vertex but is projected to infinity
   1082 (the far cap) in the vertex program.
   1083 =====================
   1084 */
   1085 void DynamicShadowVolumeJob( const dynamicShadowVolumeParms_t * parms ) {
   1086 	if ( parms->tempFacing == NULL ) {
   1087 		*const_cast< byte ** >( &parms->tempFacing ) = (byte *)_alloca16( TEMP_FACING( parms->numIndexes ) );
   1088 	}
   1089 	if ( parms->tempCulled == NULL ) {
   1090 		*const_cast< byte ** >( &parms->tempCulled ) = (byte *)_alloca16( TEMP_CULL( parms->numIndexes ) );
   1091 	}
   1092 	if ( parms->tempVerts == NULL && parms->joints != NULL ) {
   1093 		*const_cast< idVec4 ** >( &parms->tempVerts ) = (idVec4 *)_alloca16( TEMP_VERTS( parms->numVerts ) );
   1094 	}
   1095 	if ( parms->indexBuffer == NULL ) {
   1096 		*const_cast< triIndex_t ** >( &parms->indexBuffer ) = (triIndex_t *)_alloca16( OUTPUT_INDEX_BUFFER_SIZE );
   1097 	}
   1098 
   1099 	assert( parms->joints == NULL || parms->numJoints > 0 );
   1100 
   1101 	// Calculate the shadow depth bounds.
   1102 	float shadowZMin = parms->lightZMin;
   1103 	float shadowZMax = parms->lightZMax;
   1104 	if ( parms->useShadowDepthBounds ) {
   1105 		idRenderMatrix::DepthBoundsForShadowBounds( shadowZMin, shadowZMax, parms->triangleMVP, parms->triangleBounds, parms->localLightOrigin, true );
   1106 		shadowZMin = Max( shadowZMin, parms->lightZMin );
   1107 		shadowZMax = Min( shadowZMax, parms->lightZMax );
   1108 	}
   1109 
   1110 	bool renderZFail = false;
   1111 	int numShadowIndices = 0;
   1112 	int numLightIndices = 0;
   1113 
   1114 	// The shadow volume may be depth culled if either the shadow volume was culled to the view frustum or if the
   1115 	// depth range of the visible part of the shadow volume is outside the depth range of the light volume.
   1116 	if ( shadowZMin < shadowZMax ) {
   1117 
   1118 		// Check if we need to render the shadow volume with Z-fail.
   1119 		bool * preciseInsideShadowVolume = NULL;
   1120 		// If the view is potentially inside the shadow volume bounds we may need to render with Z-fail.
   1121 		if ( R_ViewPotentiallyInsideInfiniteShadowVolume( parms->triangleBounds, parms->localLightOrigin, parms->localViewOrigin, parms->zNear * INSIDE_SHADOW_VOLUME_EXTRA_STRETCH ) ) {
   1122 			// Optionally perform a more precise test to see whether or not the view is inside the shadow volume.
   1123 			if ( parms->useShadowPreciseInsideTest ) {
   1124 				preciseInsideShadowVolume = & renderZFail;
   1125 			} else {
   1126 				renderZFail = true;
   1127 			}
   1128 		}
   1129 
   1130 		// Calculate the facing of each triangle and cull each triangle to the light volume.
   1131 		// Optionally also calculate more precisely whether or not the view is inside the shadow volume.
   1132 		int numFrontFacing = 0;
   1133 		if ( parms->joints != NULL ) {
   1134 			numFrontFacing = CalculateTriangleFacingCulledSkinned( parms->tempFacing, parms->tempCulled, parms->tempVerts, parms->indexes, parms->numIndexes,
   1135 																parms->verts, parms->numVerts, parms->joints,
   1136 																parms->localLightOrigin, parms->localViewOrigin,
   1137 																parms->cullShadowTrianglesToLight, parms->localLightProject,
   1138 																preciseInsideShadowVolume, parms->zNear * INSIDE_SHADOW_VOLUME_EXTRA_STRETCH );
   1139 		} else {
   1140 			numFrontFacing = CalculateTriangleFacingCulledStatic( parms->tempFacing, parms->tempCulled, parms->indexes, parms->numIndexes,
   1141 																parms->verts, parms->numVerts,
   1142 																parms->localLightOrigin, parms->localViewOrigin,
   1143 																parms->cullShadowTrianglesToLight, parms->localLightProject,
   1144 																preciseInsideShadowVolume, parms->zNear * INSIDE_SHADOW_VOLUME_EXTRA_STRETCH );
   1145 		}
   1146 
   1147 		// Create shadow volume indices.
   1148 		if ( parms->shadowIndices != NULL  ) {
   1149 			const int numTriangles = parms->numIndexes / 3;
   1150 
   1151 			// If there are any triangles facing away from the light.
   1152 			if ( numTriangles - numFrontFacing > 0 ) {
   1153 				// Set the "fake triangle" used by dangling edges to facing so a dangling edge will
   1154 				// make a silhouette if the triangle that uses the dangling edges is not facing.
   1155 				// Note that dangling edges outside the light frustum do not make silhouettes because
   1156 				// a triangle outside the light frustum is also set to facing just like the "fake triangle"
   1157 				// used by a dangling edge.
   1158 				parms->tempFacing[numTriangles] = 255;
   1159 
   1160 				// Check if we can avoid rendering the shadow volume caps.
   1161 				bool renderShadowCaps = parms->forceShadowCaps || renderZFail;
   1162 
   1163 				// Create new triangles along the silhouette planes and optionally add end-cap triangles on the model and on the distant projection.
   1164 				R_CreateShadowVolumeTriangles( parms->shadowIndices, parms->indexBuffer, numShadowIndices, parms->tempFacing,
   1165 												parms->silEdges, parms->numSilEdges, parms->indexes, parms->numIndexes, renderShadowCaps );
   1166 
   1167 				assert( numShadowIndices <= parms->maxShadowIndices );
   1168 			}
   1169 		}
   1170 
   1171 		// Create new indices with only the triangles that are inside the light volume.
   1172 		if ( parms->lightIndices != NULL ) {
   1173 			R_CreateLightTriangles( parms->lightIndices, parms->indexBuffer, numLightIndices, parms->tempCulled, parms->indexes, parms->numIndexes );
   1174 
   1175 			assert( numLightIndices <= parms->maxLightIndices );
   1176 		}
   1177 	}
   1178 
   1179 	// write out the number of shadow indices
   1180 	if ( parms->numShadowIndices != NULL ) {
   1181 		*parms->numShadowIndices = numShadowIndices;
   1182 	}
   1183 	// write out the number of light indices
   1184 	if ( parms->numLightIndices != NULL ) {
   1185 		*parms->numLightIndices = numLightIndices;
   1186 	}
   1187 	// write out whether or not the shadow volume needs to be rendered with Z-Fail
   1188 	if ( parms->renderZFail != NULL ) {
   1189 		*parms->renderZFail = renderZFail;
   1190 	}
   1191 	// write out the shadow depth bounds
   1192 	if ( parms->shadowZMin != NULL ) {
   1193 		*parms->shadowZMin = shadowZMin;
   1194 	}
   1195 	if ( parms->shadowZMax != NULL ) {
   1196 		*parms->shadowZMax = shadowZMax;
   1197 	}
   1198 	// write out the shadow volume state
   1199 	if ( parms->shadowVolumeState != NULL ) {
   1200 		*parms->shadowVolumeState = SHADOWVOLUME_DONE;
   1201 	}
   1202 }
   1203 
   1204 REGISTER_PARALLEL_JOB( DynamicShadowVolumeJob, "DynamicShadowVolumeJob" );