DOOM-3-BFG

DOOM 3 BFG Edition
Log | Files | Refs

ModelOverlay.cpp (25473B)


      1 /*
      2 ===========================================================================
      3 
      4 Doom 3 BFG Edition GPL Source Code
      5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
      6 
      7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").  
      8 
      9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
     10 it under the terms of the GNU General Public License as published by
     11 the Free Software Foundation, either version 3 of the License, or
     12 (at your option) any later version.
     13 
     14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
     15 but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 GNU General Public License for more details.
     18 
     19 You should have received a copy of the GNU General Public License
     20 along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.
     21 
     22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.
     23 
     24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
     25 
     26 ===========================================================================
     27 */
     28 
     29 #pragma hdrstop
     30 #include "../idlib/precompiled.h"
     31 
     32 #include "tr_local.h"
     33 #include "Model_local.h"
     34 
     35 #include "../idlib/geometry/DrawVert_intrinsics.h"
     36 
     37 /*
     38 ====================
     39 idRenderModelOverlay::idRenderModelOverlay
     40 ====================
     41 */
     42 idRenderModelOverlay::idRenderModelOverlay() :
     43 		firstOverlay( 0 ),
     44 		nextOverlay( 0 ),
     45 		firstDeferredOverlay( 0 ),
     46 		nextDeferredOverlay( 0 ),
     47 		numOverlayMaterials( 0 ) {
     48 	memset( overlays, 0, sizeof( overlays ) );
     49 }
     50 
     51 /*
     52 ====================
     53 idRenderModelOverlay::~idRenderModelOverlay
     54 ====================
     55 */
     56 idRenderModelOverlay::~idRenderModelOverlay() {
     57 	for ( unsigned int i = 0; i < MAX_OVERLAYS; i++ ) {
     58 		FreeOverlay( overlays[i] );
     59 	}
     60 }
     61 
     62 /*
     63 =================
     64 idRenderModelOverlay::ReUse
     65 =================
     66 */
     67 void idRenderModelOverlay::ReUse() {
     68 	firstOverlay = 0;
     69 	nextOverlay = 0;
     70 	firstDeferredOverlay = 0;
     71 	nextDeferredOverlay = 0;
     72 	numOverlayMaterials = 0;
     73 
     74 	for ( unsigned int i = 0; i < MAX_OVERLAYS; i++ ) {
     75 		FreeOverlay( overlays[i] );
     76 	}
     77 }
     78 
     79 /*
     80 ====================
     81 idRenderModelOverlay::FreeOverlay
     82 ====================
     83 */
     84 void idRenderModelOverlay::FreeOverlay( overlay_t & overlay ) {
     85 	if ( overlay.verts != NULL ) {
     86 		Mem_Free( overlay.verts );
     87 	}
     88 	if ( overlay.indexes != NULL ) {
     89 		Mem_Free( overlay.indexes );
     90 	}
     91 	memset( &overlay, 0, sizeof( overlay ) );
     92 }
     93 
     94 /*
     95 ====================
     96 R_OverlayPointCullStatic
     97 ====================
     98 */
     99 static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS, halfFloat_t * texCoordT, const idPlane * planes, const idDrawVert * verts, const int numVerts ) {
    100 	assert_16_byte_aligned( cullBits );
    101 	assert_16_byte_aligned( texCoordS );
    102 	assert_16_byte_aligned( texCoordT );
    103 	assert_16_byte_aligned( verts );
    104 
    105 #ifdef ID_WIN_X86_SSE2_INTRIN
    106 
    107 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
    108 
    109 	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
    110 	const __m128 vector_float_one	= { 1.0f, 1.0f, 1.0f, 1.0f };
    111 	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
    112 	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
    113 	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
    114 	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
    115 
    116 	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
    117 	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
    118 
    119 	const __m128 p0X = _mm_splat_ps( p0, 0 );
    120 	const __m128 p0Y = _mm_splat_ps( p0, 1 );
    121 	const __m128 p0Z = _mm_splat_ps( p0, 2 );
    122 	const __m128 p0W = _mm_splat_ps( p0, 3 );
    123 
    124 	const __m128 p1X = _mm_splat_ps( p1, 0 );
    125 	const __m128 p1Y = _mm_splat_ps( p1, 1 );
    126 	const __m128 p1Z = _mm_splat_ps( p1, 2 );
    127 	const __m128 p1W = _mm_splat_ps( p1, 3 );
    128 
    129 	for ( int i = 0; i < numVerts; ) {
    130 
    131 		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;
    132 
    133 		for ( ; i <= nextNumVerts; i += 4 ) {
    134 			const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() );
    135 			const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() );
    136 			const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() );
    137 			const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() );
    138 
    139 			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
    140 			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
    141 			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
    142 			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w
    143 
    144 			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
    145 			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
    146 			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z
    147 
    148 			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
    149 			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
    150 			const __m128 d2 = _mm_sub_ps( vector_float_one, d0 );
    151 			const __m128 d3 = _mm_sub_ps( vector_float_one, d1 );
    152 
    153 			__m128i flt16S = FastF32toF16( __m128c( d0 ) );
    154 			__m128i flt16T = FastF32toF16( __m128c( d1 ) );
    155 
    156 			_mm_storel_epi64( (__m128i *)&texCoordS[i], flt16S );
    157 			_mm_storel_epi64( (__m128i *)&texCoordT[i], flt16T );
    158 
    159 			__m128i c0 = __m128c( _mm_cmplt_ps( d0, vector_float_zero ) );
    160 			__m128i c1 = __m128c( _mm_cmplt_ps( d1, vector_float_zero ) );
    161 			__m128i c2 = __m128c( _mm_cmplt_ps( d2, vector_float_zero ) );
    162 			__m128i c3 = __m128c( _mm_cmplt_ps( d3, vector_float_zero ) );
    163 
    164 			c0 = _mm_and_si128( c0, vector_int_mask0 );
    165 			c1 = _mm_and_si128( c1, vector_int_mask1 );
    166 			c2 = _mm_and_si128( c2, vector_int_mask2 );
    167 			c3 = _mm_and_si128( c3, vector_int_mask3 );
    168 
    169 			c0 = _mm_or_si128( c0, c1 );
    170 			c2 = _mm_or_si128( c2, c3 );
    171 			c0 = _mm_or_si128( c0, c2 );
    172 
    173 			c0 = _mm_packs_epi32( c0, c0 );
    174 			c0 = _mm_packus_epi16( c0, c0 );
    175 
    176 			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( c0 );
    177 		}
    178 	}
    179 
    180 #else
    181 
    182 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
    183 
    184 	for ( int i = 0; i < numVerts; ) {
    185 
    186 		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
    187 
    188 		for ( ; i <= nextNumVerts; i++ ) {
    189 			const idVec3 & v = vertsODS[i].xyz;
    190 
    191 			const float d0 = planes[0].Distance( v );
    192 			const float d1 = planes[1].Distance( v );
    193 			const float d2 = 1.0f - d0;
    194 			const float d3 = 1.0f - d1;
    195 
    196 			halfFloat_t s = Scalar_FastF32toF16( d0 );
    197 			halfFloat_t t = Scalar_FastF32toF16( d1 );
    198 
    199 			texCoordS[i] = s;
    200 			texCoordT[i] = t;
    201 
    202 			byte bits;
    203 			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
    204 			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
    205 			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
    206 			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
    207 
    208 			cullBits[i] = bits;
    209 		}
    210 	}
    211 
    212 #endif
    213 }
    214 
    215 /*
    216 ====================
    217 R_OverlayPointCullSkinned
    218 ====================
    219 */
    220 static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS, halfFloat_t * texCoordT, const idPlane * planes, const idDrawVert * verts, const int numVerts, const idJointMat * joints ) {
    221 	assert_16_byte_aligned( cullBits );
    222 	assert_16_byte_aligned( texCoordS );
    223 	assert_16_byte_aligned( texCoordT );
    224 	assert_16_byte_aligned( verts );
    225 
    226 #ifdef ID_WIN_X86_SSE2_INTRIN
    227 
    228 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
    229 
    230 	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
    231 	const __m128 vector_float_one	= { 1.0f, 1.0f, 1.0f, 1.0f };
    232 	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
    233 	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
    234 	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
    235 	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
    236 
    237 	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
    238 	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
    239 
    240 	const __m128 p0X = _mm_splat_ps( p0, 0 );
    241 	const __m128 p0Y = _mm_splat_ps( p0, 1 );
    242 	const __m128 p0Z = _mm_splat_ps( p0, 2 );
    243 	const __m128 p0W = _mm_splat_ps( p0, 3 );
    244 
    245 	const __m128 p1X = _mm_splat_ps( p1, 0 );
    246 	const __m128 p1Y = _mm_splat_ps( p1, 1 );
    247 	const __m128 p1Z = _mm_splat_ps( p1, 2 );
    248 	const __m128 p1W = _mm_splat_ps( p1, 3 );
    249 
    250 	for ( int i = 0; i < numVerts; ) {
    251 
    252 		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;
    253 
    254 		for ( ; i <= nextNumVerts; i += 4 ) {
    255 			const __m128 v0 = LoadSkinnedDrawVertPosition( vertsODS[i + 0], joints );
    256 			const __m128 v1 = LoadSkinnedDrawVertPosition( vertsODS[i + 1], joints );
    257 			const __m128 v2 = LoadSkinnedDrawVertPosition( vertsODS[i + 2], joints );
    258 			const __m128 v3 = LoadSkinnedDrawVertPosition( vertsODS[i + 3], joints );
    259 
    260 			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
    261 			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
    262 			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
    263 			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w
    264 
    265 			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
    266 			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
    267 			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z
    268 
    269 			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
    270 			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
    271 			const __m128 d2 = _mm_sub_ps( vector_float_one, d0 );
    272 			const __m128 d3 = _mm_sub_ps( vector_float_one, d1 );
    273 
    274 			__m128i flt16S = FastF32toF16( __m128c( d0 ) );
    275 			__m128i flt16T = FastF32toF16( __m128c( d1 ) );
    276 
    277 			_mm_storel_epi64( (__m128i *)&texCoordS[i], flt16S );
    278 			_mm_storel_epi64( (__m128i *)&texCoordT[i], flt16T );
    279 
    280 			__m128i c0 = __m128c( _mm_cmplt_ps( d0, vector_float_zero ) );
    281 			__m128i c1 = __m128c( _mm_cmplt_ps( d1, vector_float_zero ) );
    282 			__m128i c2 = __m128c( _mm_cmplt_ps( d2, vector_float_zero ) );
    283 			__m128i c3 = __m128c( _mm_cmplt_ps( d3, vector_float_zero ) );
    284 
    285 			c0 = _mm_and_si128( c0, vector_int_mask0 );
    286 			c1 = _mm_and_si128( c1, vector_int_mask1 );
    287 			c2 = _mm_and_si128( c2, vector_int_mask2 );
    288 			c3 = _mm_and_si128( c3, vector_int_mask3 );
    289 
    290 			c0 = _mm_or_si128( c0, c1 );
    291 			c2 = _mm_or_si128( c2, c3 );
    292 			c0 = _mm_or_si128( c0, c2 );
    293 
    294 			c0 = _mm_packs_epi32( c0, c0 );
    295 			c0 = _mm_packus_epi16( c0, c0 );
    296 
    297 			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( c0 );
    298 		}
    299 	}
    300 
    301 #else
    302 
    303 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
    304 
    305 	for ( int i = 0; i < numVerts; ) {
    306 
    307 		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
    308 
    309 		for ( ; i <= nextNumVerts; i++ ) {
    310 			const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
    311 
    312 			const float d0 = planes[0].Distance( transformed );
    313 			const float d1 = planes[1].Distance( transformed );
    314 			const float d2 = 1.0f - d0;
    315 			const float d3 = 1.0f - d1;
    316 
    317 			halfFloat_t s = Scalar_FastF32toF16( d0 );
    318 			halfFloat_t t = Scalar_FastF32toF16( d1 );
    319 
    320 			texCoordS[i] = s;
    321 			texCoordT[i] = t;
    322 
    323 			byte bits;
    324 			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
    325 			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
    326 			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
    327 			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
    328 
    329 			cullBits[i] = bits;
    330 		}
    331 	}
    332 
    333 #endif
    334 }
    335 
    336 /*
    337 =====================
    338 idRenderModelOverlay::CreateOverlay
    339 
    340 This projects on both front and back sides to avoid seams
    341 The material should be clamped, because entire triangles are added, some of which
    342 may extend well past the 0.0 to 1.0 texture range
    343 =====================
    344 */
    345 void idRenderModelOverlay::CreateOverlay( const idRenderModel *model, const idPlane localTextureAxis[2], const idMaterial *material ) {
    346 	// count up the maximum possible vertices and indexes per surface
    347 	int maxVerts = 0;
    348 	int maxIndexes = 0;
    349 	for ( int surfNum = 0; surfNum < model->NumSurfaces(); surfNum++ ) {
    350 		const modelSurface_t *surf = model->Surface( surfNum );
    351 		if ( surf->geometry->numVerts > maxVerts ) {
    352 			maxVerts = surf->geometry->numVerts;
    353 		}
    354 		if ( surf->geometry->numIndexes > maxIndexes ) {
    355 			maxIndexes = surf->geometry->numIndexes;
    356 		}
    357 	}
    358 	maxIndexes += 3 * 16 / sizeof( triIndex_t );	// to allow the index size to be a multiple of 16 bytes
    359 
    360 	// make temporary buffers for the building process
    361 	idTempArray< byte > cullBits( maxVerts );
    362 	idTempArray< halfFloat_t > texCoordS( maxVerts );
    363 	idTempArray< halfFloat_t > texCoordT( maxVerts );
    364 	idTempArray< triIndex_t > vertexRemap( maxVerts );
    365 	idTempArray< overlayVertex_t > overlayVerts( maxVerts );
    366 	idTempArray< triIndex_t > overlayIndexes( maxIndexes );
    367 
    368 	// pull out the triangles we need from the base surfaces
    369 	for ( int surfNum = 0; surfNum < model->NumBaseSurfaces(); surfNum++ ) {
    370 		const modelSurface_t *surf = model->Surface( surfNum );
    371 
    372 		if ( surf->geometry == NULL || surf->shader == NULL ) {
    373 			continue;
    374 		}
    375 
    376 		// some surfaces can explicitly disallow overlays
    377 		if ( !surf->shader->AllowOverlays() ) {
    378 			continue;
    379 		}
    380 
    381 		const srfTriangles_t *tri = surf->geometry;
    382 
    383 		// try to cull the whole surface along the first texture axis
    384 		const float d0 = tri->bounds.PlaneDistance( localTextureAxis[0] );
    385 		if ( d0 < 0.0f || d0 > 1.0f ) {
    386 			continue;
    387 		}
    388 
    389 		// try to cull the whole surface along the second texture axis
    390 		const float d1 = tri->bounds.PlaneDistance( localTextureAxis[1] );
    391 		if ( d1 < 0.0f || d1 > 1.0f ) {
    392 			continue;
    393 		}
    394 
    395 		if ( tri->staticModelWithJoints != NULL && r_useGPUSkinning.GetBool() ) {
    396 			R_OverlayPointCullSkinned( cullBits.Ptr(), texCoordS.Ptr(), texCoordT.Ptr(), localTextureAxis, tri->verts, tri->numVerts, tri->staticModelWithJoints->jointsInverted );
    397 		} else {
    398 			R_OverlayPointCullStatic( cullBits.Ptr(), texCoordS.Ptr(), texCoordT.Ptr(), localTextureAxis, tri->verts, tri->numVerts );
    399 		}
    400 
    401 		// start streaming the indexes
    402 		idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 3 > indexesODS( tri->indexes, tri->numIndexes );
    403 
    404 		memset( vertexRemap.Ptr(), -1, vertexRemap.Size() );
    405 		int numIndexes = 0;
    406 		int numVerts = 0;
    407 		int maxReferencedVertex = 0;
    408 
    409 		// find triangles that need the overlay
    410 		for ( int i = 0; i < tri->numIndexes; ) {
    411 
    412 			const int nextNumIndexes = indexesODS.FetchNextBatch() - 3;
    413 
    414 			for ( ; i <= nextNumIndexes; i += 3 ) {
    415 				const int i0 = indexesODS[i + 0];
    416 				const int i1 = indexesODS[i + 1];
    417 				const int i2 = indexesODS[i + 2];
    418 
    419 				// skip triangles completely off one side
    420 				if ( cullBits[i0] & cullBits[i1] & cullBits[i2] ) {
    421 					continue;
    422 				}
    423 
    424 				// we could do more precise triangle culling, like a light interaction does, but it's not worth it
    425 
    426 				// keep this triangle
    427 				for ( int j = 0; j < 3; j++ ) {
    428 					int index = tri->indexes[i + j];
    429 					if ( vertexRemap[index] == (triIndex_t) -1 ) {
    430 						vertexRemap[index] = numVerts;
    431 
    432 						overlayVerts[numVerts].vertexNum = index;
    433 						overlayVerts[numVerts].st[0] = texCoordS[index];
    434 						overlayVerts[numVerts].st[1] = texCoordT[index];
    435 						numVerts++;
    436 
    437 						maxReferencedVertex = Max( maxReferencedVertex, index );
    438 					}
    439 					overlayIndexes[numIndexes] = vertexRemap[index];
    440 					numIndexes++;
    441 				}
    442 			}
    443 		}
    444 
    445 		if ( numIndexes == 0 ) {
    446 			continue;
    447 		}
    448 
    449 		// add degenerate triangles until the index size is a multiple of 16 bytes
    450 		for ( ; ( ( ( numIndexes * sizeof( triIndex_t ) ) & 15 ) != 0 ); numIndexes += 3 ) {
    451 			overlayIndexes[numIndexes + 0] = 0;
    452 			overlayIndexes[numIndexes + 1] = 0;
    453 			overlayIndexes[numIndexes + 2] = 0;
    454 		}
    455 
    456 		// allocate a new overlay
    457 		overlay_t & overlay = overlays[nextOverlay++ & ( MAX_OVERLAYS - 1 )];
    458 		FreeOverlay( overlay );
    459 		overlay.material = material;
    460 		overlay.surfaceNum = surfNum;
    461 		overlay.surfaceId = surf->id;
    462 		overlay.numIndexes = numIndexes;
    463 		overlay.indexes = (triIndex_t *)Mem_Alloc( numIndexes * sizeof( overlay.indexes[0] ), TAG_MODEL );
    464 		memcpy( overlay.indexes, overlayIndexes.Ptr(), numIndexes * sizeof( overlay.indexes[0] ) );
    465 		overlay.numVerts = numVerts;
    466 		overlay.verts = (overlayVertex_t *)Mem_Alloc( numVerts * sizeof( overlay.verts[0] ), TAG_MODEL );
    467 		memcpy( overlay.verts, overlayVerts.Ptr(), numVerts * sizeof( overlay.verts[0] ) );
    468 		overlay.maxReferencedVertex = maxReferencedVertex;
    469 
    470 		if ( nextOverlay - firstOverlay > MAX_OVERLAYS ) {
    471 			firstOverlay = nextOverlay - MAX_OVERLAYS;
    472 		}
    473 	}
    474 }
    475 
    476 /*
    477 ====================
    478 idRenderModelOverlay::CreateDeferredOverlays
    479 ====================
    480 */
    481 void idRenderModelOverlay::CreateDeferredOverlays( const idRenderModel * model ) {
    482 	for ( unsigned int i = firstDeferredOverlay; i < nextDeferredOverlay; i++ ) {
    483 		const overlayProjectionParms_t & parms = deferredOverlays[i & ( MAX_DEFERRED_OVERLAYS - 1 )];
    484 		if ( parms.startTime > tr.viewDef->renderView.time[0] -  DEFFERED_OVERLAY_TIMEOUT ) {
    485 			CreateOverlay( model, parms.localTextureAxis, parms.material );
    486 		}
    487 	}
    488 	firstDeferredOverlay = 0;
    489 	nextDeferredOverlay = 0;
    490 }
    491 
    492 /*
    493 ====================
    494 idRenderModelOverlay::AddDeferredOverlay
    495 ====================
    496 */
    497 void idRenderModelOverlay::AddDeferredOverlay( const overlayProjectionParms_t & localParms ) {
    498 	deferredOverlays[nextDeferredOverlay++ & ( MAX_DEFERRED_OVERLAYS - 1 )] = localParms;
    499 	if ( nextDeferredOverlay - firstDeferredOverlay > MAX_DEFERRED_OVERLAYS ) {
    500 		firstDeferredOverlay = nextDeferredOverlay - MAX_DEFERRED_OVERLAYS;
    501 	}
    502 }
    503 
    504 /*
    505 ====================
    506 R_CopyOverlaySurface
    507 ====================
    508 */
    509 static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes, const overlay_t * overlay, const idDrawVert * sourceVerts ) {
    510 	assert_16_byte_aligned( &verts[numVerts] );
    511 	assert_16_byte_aligned( &indexes[numIndexes] );
    512 	assert_16_byte_aligned( overlay->verts );
    513 	assert_16_byte_aligned( overlay->indexes );
    514 	assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
    515 	assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
    516 
    517 #ifdef ID_WIN_X86_SSE2_INTRIN
    518 
    519 	const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
    520 	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
    521 	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
    522 
    523 	// copy vertices
    524 	for ( int i = 0; i < overlay->numVerts; i++ ) {
    525 		const overlayVertex_t &overlayVert = overlay->verts[i];
    526 		const idDrawVert &srcVert = sourceVerts[overlayVert.vertexNum];
    527 		idDrawVert &dstVert = verts[numVerts + i];
    528 
    529 		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
    530 		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );
    531 		__m128i st = _mm_cvtsi32_si128( *(unsigned int *)overlayVert.st );
    532 
    533 		st = _mm_shuffle_epi32( st, _MM_SHUFFLE( 0, 1, 2, 3 ) );
    534 		v0 = _mm_and_si128( v0, vector_int_clear_last );
    535 		v0 = _mm_or_si128( v0, st );
    536 
    537 		_mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
    538 		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
    539 	}
    540 
    541 	// copy indexes
    542 	assert( ( overlay->numIndexes & 7 ) == 0 );
    543 	assert( sizeof( triIndex_t ) == 2 );
    544 	for ( int i = 0; i < overlay->numIndexes; i += 8 ) {
    545 		__m128i vi = _mm_load_si128( (const __m128i *)&overlay->indexes[i] );
    546 
    547 		vi = _mm_add_epi16( vi, vector_short_num_verts );
    548 
    549 		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
    550 	}
    551 
    552 	_mm_sfence();
    553 
    554 #else
    555 
    556 	// copy vertices
    557 	for ( int i = 0; i < overlay->numVerts; i++ ) {
    558 		const overlayVertex_t &overlayVert = overlay->verts[i];
    559 
    560 		// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
    561 		verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
    562 		verts[numVerts + i].st[0] = overlayVert.st[0];
    563 		verts[numVerts + i].st[1] = overlayVert.st[1];
    564 	}
    565 
    566 	// copy indexes
    567 	for ( int i = 0; i < overlay->numIndexes; i += 2 ) {
    568 		assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
    569 		WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
    570 	}
    571 
    572 #endif
    573 }
    574 
    575 /*
    576 =====================
    577 idRenderModelOverlay::GetNumOverlayDrawSurfs
    578 =====================
    579 */
    580 unsigned int idRenderModelOverlay::GetNumOverlayDrawSurfs() {
    581 	numOverlayMaterials = 0;
    582 
    583 	for ( unsigned int i = firstOverlay; i < nextOverlay; i++ ) {
    584 		const overlay_t & overlay = overlays[i & ( MAX_OVERLAYS - 1 )];
    585 
    586 		unsigned int j = 0;
    587 		for ( ; j < numOverlayMaterials; j++ ) {
    588 			if ( overlayMaterials[j] == overlay.material ) {
    589 				break;
    590 			}
    591 		}
    592 		if ( j >= numOverlayMaterials ) {
    593 			overlayMaterials[numOverlayMaterials++] = overlay.material;
    594 		}
    595 	}
    596 
    597 	return numOverlayMaterials;
    598 }
    599 
    600 /*
    601 ====================
    602 idRenderModelOverlay::CreateOverlayDrawSurf
    603 ====================
    604 */
    605 drawSurf_t * idRenderModelOverlay::CreateOverlayDrawSurf( const viewEntity_t *space, const idRenderModel *baseModel, unsigned int index ) {
    606 	if ( index < 0 || index >= numOverlayMaterials ) {
    607 		return NULL;
    608 	}
    609 
    610 	// md5 models won't have any surfaces when r_showSkel is set
    611 	if ( baseModel == NULL || baseModel->IsDefaultModel() || baseModel->NumSurfaces() == 0 ) {
    612 		return NULL;
    613 	}
    614 
    615 	assert( baseModel->IsDynamicModel() == DM_STATIC );
    616 
    617 	const idRenderModelStatic * staticModel = static_cast< const idRenderModelStatic * >( baseModel );
    618 
    619 	const idMaterial * material = overlayMaterials[index];
    620 
    621 	int maxVerts = 0;
    622 	int maxIndexes = 0;
    623 	for ( unsigned int i = firstOverlay; i < nextOverlay; i++ ) {
    624 		const overlay_t & overlay = overlays[i & ( MAX_OVERLAYS - 1 )];
    625 		if ( overlay.material == material ) {
    626 			maxVerts += overlay.numVerts;
    627 			maxIndexes += overlay.numIndexes;
    628 		}
    629 	}
    630 
    631 	if ( maxVerts == 0 || maxIndexes == 0 ) {
    632 		return NULL;
    633 	}
    634 
    635 	// create a new triangle surface in frame memory so it gets automatically disposed of
    636 	srfTriangles_t *newTri = (srfTriangles_t *)R_ClearedFrameAlloc( sizeof( *newTri ), FRAME_ALLOC_SURFACE_TRIANGLES );
    637 	newTri->staticModelWithJoints = ( staticModel->jointsInverted != NULL ) ? const_cast< idRenderModelStatic * >( staticModel ) : NULL;	// allow GPU skinning
    638 
    639 	newTri->ambientCache = vertexCache.AllocVertex( NULL, ALIGN( maxVerts * sizeof( idDrawVert ), VERTEX_CACHE_ALIGN ) );
    640 	newTri->indexCache = vertexCache.AllocIndex( NULL, ALIGN( maxIndexes * sizeof( triIndex_t ), INDEX_CACHE_ALIGN ) );
    641 
    642 	idDrawVert * mappedVerts = (idDrawVert *)vertexCache.MappedVertexBuffer( newTri->ambientCache );
    643 	triIndex_t * mappedIndexes = (triIndex_t *)vertexCache.MappedIndexBuffer( newTri->indexCache );
    644 
    645 	int numVerts = 0;
    646 	int numIndexes = 0;
    647 
    648 	for ( unsigned int i = firstOverlay; i < nextOverlay; i++ ) {
    649 		overlay_t & overlay = overlays[i & ( MAX_OVERLAYS - 1 )];
    650 
    651 		if ( overlay.numVerts == 0 ) {
    652 			if ( i == firstOverlay ) {
    653 				firstOverlay++;
    654 			}
    655 			continue;
    656 		}
    657 
    658 		if ( overlay.material != material ) {
    659 			continue;
    660 		}
    661 
    662 		// get the source model surface for this overlay surface
    663 		const modelSurface_t * baseSurf = ( overlay.surfaceNum < staticModel->NumSurfaces() ) ? staticModel->Surface( overlay.surfaceNum ) : NULL;
    664 
    665 		// if the surface ids no longer match
    666 		if ( baseSurf == NULL || baseSurf->id != overlay.surfaceId ) {
    667 			// find the surface with the correct id
    668 			if ( staticModel->FindSurfaceWithId( overlay.surfaceId, overlay.surfaceNum ) ) {
    669 				baseSurf = staticModel->Surface( overlay.surfaceNum );
    670 			} else {
    671 				// the surface with this id no longer exists
    672 				FreeOverlay( overlay );
    673 				if ( i == firstOverlay ) {
    674 					firstOverlay++;
    675 				}
    676 				continue;
    677 			}
    678 		}
    679 
    680 		// check for out of range vertex references
    681 		const srfTriangles_t * baseTri = baseSurf->geometry;
    682 		if ( overlay.maxReferencedVertex >= baseTri->numVerts ) {
    683 			// This can happen when playing a demofile and a model has been changed since it was recorded, so just issue a warning and go on.
    684 			common->Warning( "idRenderModelOverlay::CreateOverlayDrawSurf: overlay vertex out of range.  Model has probably changed since generating the overlay." );
    685 			FreeOverlay( overlay );
    686 			if ( i == firstOverlay ) {
    687 				firstOverlay++;
    688 			}
    689 			continue;
    690 		}
    691 
    692 		// use SIMD optimized routine to copy the vertices and indices directly to write-combined memory
    693 		R_CopyOverlaySurface( mappedVerts, numVerts, mappedIndexes, numIndexes, &overlay, baseTri->verts );
    694 
    695 		numIndexes += overlay.numIndexes;
    696 		numVerts += overlay.numVerts;
    697 	}
    698 
    699 	newTri->numVerts = numVerts;
    700 	newTri->numIndexes = numIndexes;
    701 	
    702 	// create the drawsurf
    703 	drawSurf_t * drawSurf = (drawSurf_t *)R_FrameAlloc( sizeof( *drawSurf ), FRAME_ALLOC_DRAW_SURFACE );
    704 	drawSurf->frontEndGeo = newTri;
    705 	drawSurf->numIndexes = newTri->numIndexes;
    706 	drawSurf->ambientCache = newTri->ambientCache;
    707 	drawSurf->indexCache = newTri->indexCache;
    708 	drawSurf->shadowCache = 0;
    709 	drawSurf->space = space;
    710 	drawSurf->scissorRect = space->scissorRect;
    711 	drawSurf->extraGLState = 0;
    712 	drawSurf->renderZFail = 0;
    713 
    714 	R_SetupDrawSurfShader( drawSurf, material, &space->entityDef->parms );
    715 	R_SetupDrawSurfJoints( drawSurf, newTri, NULL );
    716 
    717 	return drawSurf;
    718 }
    719 
    720 /*
    721 ====================
    722 idRenderModelOverlay::ReadFromDemoFile
    723 ====================
    724 */
// Not implemented: the body is empty, so the demo file f is never read and the
// overlay state is left untouched.
void idRenderModelOverlay::ReadFromDemoFile( idDemoFile *f ) {
	// FIXME: implement
}
    728 
    729 /*
    730 ====================
    731 idRenderModelOverlay::WriteToDemoFile
    732 ====================
    733 */
// Not implemented: the body is empty, so nothing about the overlays is written
// to the demo file f.
void idRenderModelOverlay::WriteToDemoFile( idDemoFile *f ) const {
	// FIXME: implement
}