SoftwareCache.h (22385B)
1 /* 2 =========================================================================== 3 4 Doom 3 BFG Edition GPL Source Code 5 Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 6 7 This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 8 9 Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>. 21 22 In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. 23 24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. 25 26 =========================================================================== 27 */ 28 #ifndef __SOFTWARECACHE_H__ 29 #define __SOFTWARECACHE_H__ 30 31 #pragma warning( disable : 4324 ) // structure was padded due to __declspec(align()) 32 33 /* 34 ================================================================================================ 35 36 On-Demand Streamed Objects and Arrays 37 38 idODSObject // DMA in a single object 39 idODSCachedObject // DMA in a single object through a software cache 40 idODSArray // DMA in an array with objects 41 idODSIndexedArray // DMA gather from an array with objects 42 idODSStreamedArray // overlapped DMA streaming of an array with objects 43 idODSStreamedIndexedArray // overlapped DMA gather from an array with objects 44 45 On the SPU the 'idODSObject' streams the data into temporary memory using the DMA controller 46 and the object constructor immediately waits for the DMA transfer to complete. In other words 47 there is no caching and every random memory access incurs full memory latency. This should be 48 used to stream in objects that are only used once at unpredictable times. 49 50 The 'idODSCachedObject' uses an object based software cache on the SPU which is useful for 51 streaming in objects that may be used repeatedly or which usage can be predicted allowing 52 the objects to be prefetched. 53 54 class idMyType {}; 55 class idMyCache : public idSoftwareCache< idMyType, 8, 4 > {}; 56 idMyCache myCache; 57 idMyType * myPtr; 58 idODSCachedObject< idMyType, idMyCache > myODS( myPtr, myCache ); 59 60 The 'idSoftwareCache' implements a Prefetch() function that can be used to prefetch whole 61 objects into the cache well before they are needed. However, any idODSObject, idODSArray, 62 idODSIndexedArray etc. after calling the Prefetch() function will have to wait for the 63 prefetch to complete. In other words, make sure there is enough "work" done in between 64 a Prefetch() call and the first next idODS* object. 65 66 The 'idODSArray' streams in a block of objects that are tightly packed in memory. 67 68 The 'idODSIndexedArray' is used to gather a number of objects that are not necessarily 69 contiguous in memory. On the SPU a DMA-list is used in the 'idODSIndexedArray' constructor 70 to efficiently gather all the objects. 71 72 The 'idODSStreamedArray' is used for sequentially reading a large input array. Overlapped 73 streaming is used where one batch of array elements can be accessed while the next batch 74 is being streamed in. 75 76 The 'idODSStreamedIndexedArray' is used for gathering elements from an array using a 77 sequentially read index. Overlapped streaming is used for both the index and the array 78 elements where one batch of array elements can be accessed while the next batch of 79 indices/array elements is being streamed in. 80 81 Outside the SPU, data is never copied to temporary memory because this would cause 82 significant load-hit-store penalties. Instead, the object constructor issues prefetch 83 instructions where appropriate and only maintains pointers to the actual data. In the 84 case of 'idODSObject' or 'idODSCachedObject' the class is no more than a simple wrapper 85 of a pointer and the class should completely compile away with zero overhead. 86 87 COMMON MISTAKES: 88 89 1. When using ODS objects do not forget to set the "globalDmaTag" that is used to issue 90 and wait for DMAs. 91 92 void cellSpursJobMain2( CellSpursJobContext2 * stInfo, CellSpursJob256 * job ) { 93 globalDmaTag = stInfo->dmaTag; // for ODS objects 94 } 95 96 2. ODS objects can consume quite a bit of stack space. You may have to increase the SPU job 97 stack size. For instance: 98 99 job->header.sizeStack = SPURS_QUADWORDS( 16 * 1024 ); // the ODS objects get pretty large 100 101 Make sure you measure the size of each ODS object and if there are recursive functions 102 using ODS objects make sure the recursion is bounded. When the stack overflows the scratch 103 and output memory may get overwritten and the results will be undefined. Finding stack 104 overflows is painful. 105 106 3. While you can setup a regular DMA list entry to use a NULL pointer with zero size, do not use 107 a NULL pointer for a cache DMA list entry. This confuses SPURS and can cause your SPU binary 108 to get corrupted. 109 110 ================================================================================================ 111 */ 112 113 extern uint32 globalDmaTag; 114 115 #define MAX_DMA_SIZE ( 1 << 14 ) 116 #define ODS_ROUND16( x ) ( ( x + 15 ) & ~15 ) 117 118 enum streamBufferType_t { 119 SBT_DOUBLE = 2, 120 SBT_QUAD = 4 121 }; 122 123 124 /* 125 ================================================================================================ 126 127 non-SPU code 128 129 ================================================================================================ 130 */ 131 132 /* 133 ================================================ 134 idSoftwareCache 135 ================================================ 136 */ 137 template< typename _type_, int _entries_ = 8, int _associativity_ = 4, bool aligned = false > 138 class ALIGNTYPE128 idSoftwareCache { 139 public: 140 void Prefetch( const _type_ * obj ) { 141 ::Prefetch( obj, 0 ); 142 } 143 }; 144 145 /* 146 ================================================ 147 idODSObject 148 ================================================ 149 */ 150 template< typename _type_ > 151 class idODSObject { 152 public: 153 idODSObject( const _type_ * obj ) : objectPtr( obj ) {} 154 operator const _type_ & () const { return *objectPtr; } 155 const _type_ * operator->() const { return objectPtr; } 156 const _type_ & Get() const { return *objectPtr; } 157 const _type_ * Ptr() const { return objectPtr; } 158 const _type_ * OriginalPtr() const { return objectPtr; } 159 160 private: 161 const _type_ * objectPtr; 162 }; 163 164 /* 165 ================================================ 166 idODSCachedObject 167 ================================================ 168 */ 169 template< typename _type_, typename _cache_ > 170 class idODSCachedObject { 171 public: 172 idODSCachedObject( const _type_ * obj, _cache_ & cache ) : objectPtr( obj ) {} 173 operator const _type_ & () const { return *objectPtr; } 174 const _type_ * operator->() const { return objectPtr; } 175 const _type_ & Get() const { return *objectPtr; } 176 const _type_ * Ptr() const { return objectPtr; } 177 const _type_ * OriginalPtr() const { return objectPtr; } 178 179 private: 180 const _type_ * objectPtr; 181 }; 182 183 /* 184 ================================================ 185 idODSArray 186 ================================================ 187 */ 188 template< typename _type_, int max > 189 class idODSArray { 190 public: 191 idODSArray( const _type_ * array, int num ) : arrayPtr( array ), arrayNum( num ) { 192 assert( num <= max ); 193 Prefetch( array, 0 ); 194 } 195 const _type_ & operator[]( int index ) const { 196 assert( index >= 0 && index < arrayNum ); 197 return arrayPtr[index]; 198 } 199 const _type_ * Ptr() const { return arrayPtr; } 200 const int Num() const { return arrayNum; } 201 202 private: 203 const _type_ * arrayPtr; 204 int arrayNum; 205 }; 206 207 /* 208 ================================================ 209 idODSIndexedArray 210 ================================================ 211 */ 212 template< typename _elemType_, typename _indexType_, int max > 213 class idODSIndexedArray { 214 public: 215 idODSIndexedArray( const _elemType_ * array, const _indexType_ * index, int num ) : arrayNum( num ) { 216 assert( num <= max ); 217 for ( int i = 0; i < num; i++ ) { 218 Prefetch( arrayPtr, abs( index[i] ) * sizeof( _elemType_ ) ); 219 arrayPtr[i] = array + abs( index[i] ); 220 } 221 } 222 const _elemType_ & operator[]( int index ) const { 223 assert( index >= 0 && index < arrayNum ); 224 return * arrayPtr[index]; 225 } 226 void ReplicateUpToMultipleOfFour() { 227 assert( ( max & 3 ) == 0 ); 228 while( ( arrayNum & 3 ) != 0 ) { 229 arrayPtr[arrayNum++] = arrayPtr[0]; 230 } 231 } 232 233 private: 234 const _elemType_ * arrayPtr[max]; 235 int arrayNum; 236 }; 237 238 /* 239 ================================================ 240 idODSStreamedOutputArray 241 ================================================ 242 */ 243 template< typename _type_, int _bufferSize_ > 244 class ALIGNTYPE16 idODSStreamedOutputArray { 245 public: 246 idODSStreamedOutputArray( _type_ * array, int * numElements, int maxElements ) : 247 localNum( 0 ), 248 outArray( array ), 249 outNum( numElements ), 250 outMax( maxElements ) { 251 compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) ); 252 compile_time_assert( ( ( _bufferSize_ * sizeof( _type_ ) ) & 15 ) == 0 ); 253 compile_time_assert( _bufferSize_ * sizeof( _type_ ) < MAX_DMA_SIZE ); 254 assert_16_byte_aligned( array ); 255 } 256 ~idODSStreamedOutputArray() { 257 *outNum = localNum; 258 } 259 260 int Num() const { return localNum; } 261 void Append( _type_ element ) { assert( localNum < outMax ); outArray[localNum++] = element; } 262 _type_ & Alloc() { assert( localNum < outMax ); return outArray[localNum++]; } 263 264 private: 265 int localNum; 266 _type_ * outArray; 267 int * outNum; 268 int outMax; 269 }; 270 271 /* 272 ================================================ 273 idODSStreamedArray 274 ================================================ 275 */ 276 template< typename _type_, int _bufferSize_, streamBufferType_t _sbt_ = SBT_DOUBLE, int _roundUpToMultiple_ = 1 > 277 class ALIGNTYPE16 idODSStreamedArray { 278 public: 279 idODSStreamedArray( const _type_ * array, const int numElements ) : 280 cachedArrayStart( 0 ), 281 cachedArrayEnd( 0 ), 282 streamArrayEnd( 0 ), 283 inArray( array ), 284 inArrayNum( numElements ), 285 inArrayNumRoundedUp( numElements ) { 286 compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) ); 287 compile_time_assert( ( ( _bufferSize_ * sizeof( _type_ ) ) & 15 ) == 0 ); 288 compile_time_assert( _bufferSize_ * sizeof( _type_ ) < MAX_DMA_SIZE ); 289 compile_time_assert( _roundUpToMultiple_ >= 1 ); 290 assert_16_byte_aligned( array ); 291 assert( (uintptr_t)array > _bufferSize_ * sizeof( _type_ ) ); 292 // Fetch the first batch of elements. 293 FetchNextBatch(); 294 // Calculate the rounded up size here making the mod effectively for free because we have to wait 295 // for memory access anyway while the above FetchNextBatch() does not need the rounded up size yet. 296 inArrayNumRoundedUp += _roundUpToMultiple_ - 1; 297 inArrayNumRoundedUp -= inArrayNumRoundedUp % ( ( _roundUpToMultiple_ > 1 ) ? _roundUpToMultiple_ : 1 ); 298 } 299 ~idODSStreamedArray() { 300 // Flush the accessible part of the array. 301 FlushArray( inArray, cachedArrayStart * sizeof( _type_ ), cachedArrayEnd * sizeof( _type_ ) ); 302 } 303 304 // Fetches a new batch of array elements and returns the first index after this new batch. 305 // After calling this, the elements starting at the index returned by the previous call to 306 // FetchNextBach() (or zero if not yet called) up to (excluding) the index returned by 307 // this call to FetchNextBatch() can be accessed through the [] operator. When quad-buffering, 308 // the elements starting at the index returned by the second-from-last call to FetchNextBatch() 309 // can still be accessed. This is useful when the algorithm needs to successively access 310 // an odd number of elements at the same time that may cross a single buffer boundary. 311 int FetchNextBatch() { 312 // If not everything has been streamed already. 313 if ( cachedArrayEnd < inArrayNum ) { 314 cachedArrayEnd = streamArrayEnd; 315 cachedArrayStart = Max( cachedArrayEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 ); 316 317 // Flush the last batch of elements that is no longer accessible. 318 FlushArray( inArray, ( cachedArrayStart - _bufferSize_ ) * sizeof( _type_ ), cachedArrayStart * sizeof( _type_ ) ); 319 320 // Prefetch the next batch of elements. 321 if ( streamArrayEnd < inArrayNum ) { 322 streamArrayEnd = Min( streamArrayEnd + _bufferSize_, inArrayNum ); 323 for ( unsigned int offset = cachedArrayEnd * sizeof( _type_ ); offset < streamArrayEnd * sizeof( _type_ ); offset += CACHE_LINE_SIZE ) { 324 Prefetch( inArray, offset ); 325 } 326 } 327 } 328 return ( cachedArrayEnd == inArrayNum ) ? inArrayNumRoundedUp : cachedArrayEnd; 329 } 330 331 // Provides access to the elements starting at the index returned by the next-to-last call 332 // to FetchNextBach() (or zero if only called once so far) up to (excluding) the index 333 // returned by the last call to FetchNextBatch(). When quad-buffering, the elements starting 334 // at the index returned by the second-from-last call to FetchNextBatch() can still be accessed. 335 // This is useful when the algorithm needs to successively access an odd number of elements 336 // at the same time that may cross a single buffer boundary. 337 const _type_ & operator[]( int index ) const { 338 assert( ( index >= cachedArrayStart && index < cachedArrayEnd ) || ( cachedArrayEnd == inArrayNum && index >= inArrayNum && index < inArrayNumRoundedUp ) ); 339 if ( _roundUpToMultiple_ > 1 ) { 340 index &= ( index - inArrayNum ) >> 31; 341 } 342 return inArray[index]; 343 } 344 345 private: 346 int cachedArrayStart; 347 int cachedArrayEnd; 348 int streamArrayEnd; 349 const _type_ * inArray; 350 int inArrayNum; 351 int inArrayNumRoundedUp; 352 353 static void FlushArray( const void * flushArray, int flushStart, int flushEnd ) { 354 #if 0 355 // arrayFlushBase is rounded up so we do not flush anything before the array. 356 // arrayFlushStart is rounded down so we start right after the last cache line that was previously flushed. 357 // arrayFlushEnd is rounded down so we do not flush a cache line that holds data that may still be partially 358 // accessible or a cache line that stretches beyond the end of the array. 359 const uintptr_t arrayAddress = (uintptr_t)flushArray; 360 const uintptr_t arrayFlushBase = ( arrayAddress + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ); 361 const uintptr_t arrayFlushStart = ( arrayAddress + flushStart ) & ~( CACHE_LINE_SIZE - 1 ); 362 const uintptr_t arrayFlushEnd = ( arrayAddress + flushEnd ) & ~( CACHE_LINE_SIZE - 1 ); 363 for ( uintptr_t offset = Max( arrayFlushBase, arrayFlushStart ); offset < arrayFlushEnd; offset += CACHE_LINE_SIZE ) { 364 FlushCacheLine( flushArray, offset - arrayAddress ); 365 } 366 #endif 367 } 368 }; 369 370 /* 371 ================================================ 372 idODSStreamedIndexedArray 373 374 For gathering elements from an array using a sequentially read index. 375 This uses overlapped streaming for both the index and the array elements 376 where one batch of indices and/or array elements can be accessed while 377 the next batch is being streamed in. 378 379 NOTE: currently the size of array elements must be a multiple of 16 bytes. 380 An index with offsets and more complex logic is needed to support other sizes. 381 ================================================ 382 */ 383 template< typename _elemType_, typename _indexType_, int _bufferSize_, streamBufferType_t _sbt_ = SBT_DOUBLE, int _roundUpToMultiple_ = 1 > 384 class ALIGNTYPE16 idODSStreamedIndexedArray { 385 public: 386 idODSStreamedIndexedArray( const _elemType_ * array, const int numElements, const _indexType_ * index, const int numIndices ) : 387 cachedArrayStart( 0 ), 388 cachedArrayEnd( 0 ), 389 streamArrayEnd( 0 ), 390 cachedIndexStart( 0 ), 391 cachedIndexEnd( 0 ), 392 streamIndexEnd( 0 ), 393 inArray( array ), 394 inArrayNum( numElements ), 395 inIndex( index ), 396 inIndexNum( numIndices ), 397 inIndexNumRoundedUp( numIndices ) { 398 compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) ); 399 compile_time_assert( ( ( _bufferSize_ * sizeof( _indexType_ ) ) & 15 ) == 0 ); 400 compile_time_assert( _bufferSize_ * sizeof( _indexType_ ) < MAX_DMA_SIZE ); 401 compile_time_assert( _bufferSize_ * sizeof( _elemType_ ) < MAX_DMA_SIZE ); 402 compile_time_assert( ( sizeof( _elemType_ ) & 15 ) == 0 ); // to avoid complexity due to cellDmaListGet 403 compile_time_assert( _roundUpToMultiple_ >= 1 ); 404 assert_16_byte_aligned( index ); 405 assert_16_byte_aligned( array ); 406 assert( (uintptr_t)index > _bufferSize_ * sizeof( _indexType_ ) ); 407 assert( (uintptr_t)array > _bufferSize_ * sizeof( _elemType_ ) ); 408 // Fetch the first batch of indices. 409 FetchNextBatch(); 410 // Fetch the first batch of elements and the next batch of indices. 411 FetchNextBatch(); 412 // Calculate the rounded up size here making the mod effectively for free because we have to wait 413 // for memory access anyway while the above FetchNextBatch() do not need the rounded up size yet. 414 inIndexNumRoundedUp += _roundUpToMultiple_ - 1; 415 inIndexNumRoundedUp -= inIndexNumRoundedUp % ( ( _roundUpToMultiple_ > 1 ) ? _roundUpToMultiple_ : 1 ); 416 } 417 ~idODSStreamedIndexedArray() { 418 // Flush the accessible part of the index. 419 FlushArray( inIndex, cachedIndexStart * sizeof( _indexType_ ), cachedIndexEnd * sizeof( _indexType_ ) ); 420 // Flush the accessible part of the array. 421 FlushArray( inArray, cachedArrayStart * sizeof( _elemType_ ), cachedArrayEnd * sizeof( _elemType_ ) ); 422 } 423 424 // Fetches a new batch of array elements and returns the first index after this new batch. 425 // After calling this, the elements starting at the index returned by the previous call to 426 // FetchNextBach() (or zero if not yet called) up to (excluding) the index returned by 427 // this call to FetchNextBatch() can be accessed through the [] operator. When quad-buffering, 428 // the elements starting at the index returned by the second-from-last call to FetchNextBatch() 429 // can still be accessed. This is useful when the algorithm needs to successively access 430 // an odd number of elements at the same time that may cross a single buffer boundary. 431 int FetchNextBatch() { 432 // If not everything has been streamed already. 433 if ( cachedArrayEnd < inIndexNum ) { 434 if ( streamIndexEnd > 0 ) { 435 cachedArrayEnd = streamArrayEnd; 436 cachedArrayStart = Max( cachedArrayEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 ); 437 cachedIndexEnd = streamIndexEnd; 438 cachedIndexStart = Max( cachedIndexEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 ); 439 440 // Flush the last batch of indices that are no longer accessible. 441 FlushArray( inIndex, ( cachedIndexStart - _bufferSize_ ) * sizeof( _indexType_ ), cachedIndexStart * sizeof( _indexType_ ) ); 442 // Flush the last batch of elements that is no longer accessible. 443 FlushArray( inArray, ( cachedArrayStart - _bufferSize_ ) * sizeof( _elemType_ ), cachedArrayStart * sizeof( _elemType_ ) ); 444 445 // Prefetch the next batch of elements. 446 if ( streamArrayEnd < inIndexNum ) { 447 streamArrayEnd = cachedIndexEnd; 448 for ( int i = cachedArrayEnd; i < streamArrayEnd; i++ ) { 449 assert( i >= cachedIndexStart && i < cachedIndexEnd ); 450 assert( inIndex[i] >= 0 && inIndex[i] < inArrayNum ); 451 452 Prefetch( inArray, inIndex[i] * sizeof( _elemType_ ) ); 453 } 454 } 455 } 456 457 // Prefetch the next batch of indices. 458 if ( streamIndexEnd < inIndexNum ) { 459 streamIndexEnd = Min( streamIndexEnd + _bufferSize_, inIndexNum ); 460 for ( unsigned int offset = cachedIndexEnd * sizeof( _indexType_ ); offset < streamIndexEnd * sizeof( _indexType_ ); offset += CACHE_LINE_SIZE ) { 461 Prefetch( inIndex, offset ); 462 } 463 } 464 } 465 return ( cachedArrayEnd == inIndexNum ) ? inIndexNumRoundedUp : cachedArrayEnd; 466 } 467 468 // Provides access to the elements starting at the index returned by the next-to-last call 469 // to FetchNextBach() (or zero if only called once so far) up to (excluding) the index 470 // returned by the last call to FetchNextBatch(). When quad-buffering, the elements starting 471 // at the index returned by the second-from-last call to FetchNextBatch() can still be accessed. 472 // This is useful when the algorithm needs to successively access an odd number of elements 473 // at the same time that may cross a single buffer boundary. 474 const _elemType_ & operator[]( int index ) const { 475 assert( ( index >= cachedArrayStart && index < cachedArrayEnd ) || ( cachedArrayEnd == inIndexNum && index >= inIndexNum && index < inIndexNumRoundedUp ) ); 476 if ( _roundUpToMultiple_ > 1 ) { 477 index &= ( index - inIndexNum ) >> 31; 478 } 479 return inArray[inIndex[index]]; 480 } 481 482 private: 483 int cachedArrayStart; 484 int cachedArrayEnd; 485 int streamArrayEnd; 486 int cachedIndexStart; 487 int cachedIndexEnd; 488 int streamIndexEnd; 489 const _elemType_ * inArray; 490 int inArrayNum; 491 const _indexType_ * inIndex; 492 int inIndexNum; 493 int inIndexNumRoundedUp; 494 495 static void FlushArray( const void * flushArray, int flushStart, int flushEnd ) { 496 #if 0 497 // arrayFlushBase is rounded up so we do not flush anything before the array. 498 // arrayFlushStart is rounded down so we start right after the last cache line that was previously flushed. 499 // arrayFlushEnd is rounded down so we do not flush a cache line that holds data that may still be partially 500 // accessible or a cache line that stretches beyond the end of the array. 501 const uintptr_t arrayAddress = (uintptr_t)flushArray; 502 const uintptr_t arrayFlushBase = ( arrayAddress + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 ); 503 const uintptr_t arrayFlushStart = ( arrayAddress + flushStart ) & ~( CACHE_LINE_SIZE - 1 ); 504 const uintptr_t arrayFlushEnd = ( arrayAddress + flushEnd ) & ~( CACHE_LINE_SIZE - 1 ); 505 for ( uintptr_t offset = Max( arrayFlushBase, arrayFlushStart ); offset < arrayFlushEnd; offset += CACHE_LINE_SIZE ) { 506 FlushCacheLine( flushArray, offset - arrayAddress ); 507 } 508 #endif 509 } 510 }; 511 512 513 #endif // !__SOFTWARECACHE_H__