r_draw16.asm (30058B)
.386P
.model FLAT
;
; d_draw16.s
; x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
; subdivision.
;
; Syntax: MASM (Intel operand order), 32-bit flat model.
; Calling convention as used below: the single argument is read from the
; stack, ebp/edi/esi/ebx are saved on entry and restored on exit, and both
; routines return with a plain ret.
;

include qasm.inc
include d_if.inc

if	id386

;----------------------------------------------------------------------
; 8-bpp horizontal span drawing code for polygons, with no transparency and
; 16-pixel subdivision.
;
; Assumes there is at least one span in pspans, and that every span
; contains at least one pixel
;----------------------------------------------------------------------

_DATA SEGMENT

_DATA ENDS
_TEXT SEGMENT

; out-of-line, rarely-needed clamping code
;
; LClampHighOrLow0/1 are reached by an unsigned "ja" right after a cmp, so
; the flags of that cmp are still live: a signed "jg" then separates a value
; that is really too large (clamp to the extent) from a negative value that
; only looked large when compared unsigned (clamp to 0).

LClampHigh0:
	mov esi,ds:dword ptr[_bbextents]
	jmp LClampReentry0
LClampHighOrLow0:
	jg LClampHigh0
	xor esi,esi
	jmp LClampReentry0

LClampHigh1:
	mov edx,ds:dword ptr[_bbextentt]
	jmp LClampReentry1
LClampHighOrLow1:
	jg LClampHigh1
	xor edx,edx
	jmp LClampReentry1

; Clamps 2-5 are used for the s/t values at the end of a segment; their
; lower bound is 4096 rather than 0.
; NOTE(review): presumably the non-zero lower bound guards against round-off
; when stepping backwards -- confirm against the C reference implementation.

LClampLow2:
	mov ebp,4096
	jmp LClampReentry2
LClampHigh2:
	mov ebp,ds:dword ptr[_bbextents]
	jmp LClampReentry2

LClampLow3:
	mov ecx,4096
	jmp LClampReentry3
LClampHigh3:
	mov ecx,ds:dword ptr[_bbextentt]
	jmp LClampReentry3

LClampLow4:
	mov eax,4096
	jmp LClampReentry4
LClampHigh4:
	mov eax,ds:dword ptr[_bbextents]
	jmp LClampReentry4

LClampLow5:
	mov ebx,4096
	jmp LClampReentry5
LClampHigh5:
	mov ebx,ds:dword ptr[_bbextentt]
	jmp LClampReentry5


; stack offset of the span-list argument once the four registers below have
; been pushed: 4 bytes of return address + 16 bytes of saved registers
pspans	equ	4+16

	align 4
	public _D_DrawSpans16
_D_DrawSpans16:
	push ebp	; preserve caller's stack frame
	push edi
	push esi	; preserve register variables
	push ebx

;
; set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
; and span list pointers
;
; TODO: any overlap from rearranging?
	fld ds:dword ptr[_d_sdivzstepu]
	fmul ds:dword ptr[fp_16]
	mov edx,ds:dword ptr[_cacheblock]
	fld ds:dword ptr[_d_tdivzstepu]
	fmul ds:dword ptr[fp_16]
	mov ebx,ds:dword ptr[pspans+esp]	; point to the first span descriptor
	fld ds:dword ptr[_d_zistepu]
	fmul ds:dword ptr[fp_16]
	mov ds:dword ptr[pbase],edx	; pbase = cacheblock
	fstp ds:dword ptr[zi16stepu]
	fstp ds:dword ptr[tdivz16stepu]
	fstp ds:dword ptr[sdivz16stepu]

LSpanLoop:
;
; set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
; initial s and t values
;
; The trailing comments show the FP stack after each instruction, st(0)
; first, entries separated by "|".
;
; FIXME: pipeline FILD?
	fild ds:dword ptr[espan_t_v+ebx]
	fild ds:dword ptr[espan_t_u+ebx]

	fld st(1)	; dv | du | dv
	fmul ds:dword ptr[_d_sdivzstepv]	; dv*d_sdivzstepv | du | dv
	fld st(1)	; du | dv*d_sdivzstepv | du | dv
	fmul ds:dword ptr[_d_sdivzstepu]	; du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld st(2)	; du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmul ds:dword ptr[_d_tdivzstepu]	; du*d_tdivzstepu | du*d_sdivzstepu |
						;  dv*d_sdivzstepv | du | dv
	fxch st(1)	; du*d_sdivzstepu | du*d_tdivzstepu |
			;  dv*d_sdivzstepv | du | dv
	faddp st(2),st(0)	; du*d_tdivzstepu |
				;  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch st(1)	; du*d_sdivzstepu + dv*d_sdivzstepv |
			;  du*d_tdivzstepu | du | dv
	fld st(3)	; dv | du*d_sdivzstepu + dv*d_sdivzstepv |
			;  du*d_tdivzstepu | du | dv
	fmul ds:dword ptr[_d_tdivzstepv]	; dv*d_tdivzstepv |
						;  du*d_sdivzstepu + dv*d_sdivzstepv |
						;  du*d_tdivzstepu | du | dv
	fxch st(1)	; du*d_sdivzstepu + dv*d_sdivzstepv |
			;  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadd ds:dword ptr[_d_sdivzorigin]	; sdivz = d_sdivzorigin + dv*d_sdivzstepv +
						;  du*d_sdivzstepu; stays in %st(2) at end
	fxch st(4)	; dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
			;  s/z
	fmul ds:dword ptr[_d_zistepv]	; dv*d_zistepv | dv*d_tdivzstepv |
					;  du*d_tdivzstepu | du | s/z
	fxch st(1)	; dv*d_tdivzstepv | dv*d_zistepv |
			;  du*d_tdivzstepu | du | s/z
	faddp st(2),st(0)	; dv*d_zistepv |
				;  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch st(2)	; du | dv*d_tdivzstepv + du*d_tdivzstepu |
			;  dv*d_zistepv | s/z
	fmul ds:dword ptr[_d_zistepu]	; du*d_zistepu |
					;  dv*d_tdivzstepv + du*d_tdivzstepu |
					;  dv*d_zistepv | s/z
	fxch st(1)	; dv*d_tdivzstepv + du*d_tdivzstepu |
			;  du*d_zistepu | dv*d_zistepv | s/z
	fadd ds:dword ptr[_d_tdivzorigin]	; tdivz = d_tdivzorigin + dv*d_tdivzstepv +
						;  du*d_tdivzstepu; stays in %st(1) at end
	fxch st(2)	; dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp st(1),st(0)	; dv*d_zistepv + du*d_zistepu | t/z | s/z

	fld ds:dword ptr[fp_64k]	; fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch st(1)	; dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadd ds:dword ptr[_d_ziorigin]	; zi = d_ziorigin + dv*d_zistepv +
					;  du*d_zistepu; stays in %st(0) at end
					; 1/z | fp_64k | t/z | s/z
;
; calculate and clamp s & t
;
	fdiv st(1),st(0)	; 1/z | z*64k | t/z | s/z

;
; point %edi to the first pixel in the span
;
	mov ecx,ds:dword ptr[_d_viewbuffer]
	mov eax,ds:dword ptr[espan_t_v+ebx]
	mov ds:dword ptr[pspantemp],ebx	; preserve spans pointer

	mov edx,ds:dword ptr[_tadjust]
	mov esi,ds:dword ptr[_sadjust]
	mov edi,ds:dword ptr[_d_scantable+eax*4]	; v * screenwidth
	add edi,ecx
	mov ecx,ds:dword ptr[espan_t_u+ebx]
	add edi,ecx	; pdest = &pdestspan[scans->u];
	mov ecx,ds:dword ptr[espan_t_count+ebx]

;
; now start the FDIV for the end of the span
;
; From here on %ecx is the pixel count while more than 16 pixels remain, and
; (count - 1) once the final segment has been identified.
;
	cmp ecx,16
	ja LSetupNotLast1

	dec ecx
	jz LCleanup1	; if only one pixel, no need to start an FDIV
	mov ds:dword ptr[spancountminus1],ecx

; finish up the s and t calcs
	fxch st(1)	; z*64k | 1/z | t/z | s/z

	fld st(0)	; z*64k | z*64k | 1/z | t/z | s/z
	fmul st(0),st(4)	; s | z*64k | 1/z | t/z | s/z
	fxch st(1)	; z*64k | s | 1/z | t/z | s/z
	fmul st(0),st(3)	; t | s | 1/z | t/z | s/z
	fxch st(1)	; s | t | 1/z | t/z | s/z
	fistp ds:dword ptr[s]	; 1/z | t | t/z | s/z
	fistp ds:dword ptr[t]	; 1/z | t/z | s/z

; advance 1/z, t/z and s/z by (count - 1) single-pixel steps
	fild ds:dword ptr[spancountminus1]

	fld ds:dword ptr[_d_tdivzstepu]	; C(d_tdivzstepu) | spancountminus1
	fld ds:dword ptr[_d_zistepu]	; C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul st(0),st(2)	; C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch st(1)	; C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul st(0),st(2)	; C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch st(2)	; scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmul ds:dword ptr[_d_sdivzstepu]	; C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
						;  C(d_tdivzstepu)*scm1
	fxch st(1)	; C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
			;  C(d_tdivzstepu)*scm1
	faddp st(3),st(0)	; C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch st(1)	; C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp st(3),st(0)	; C(d_sdivzstepu)*scm1
	faddp st(3),st(0)

	fld ds:dword ptr[fp_64k]
	fdiv st(0),st(1)	; this is what we've gone to all this trouble to
				;  overlap
	jmp LFDIVInFlight1

LCleanup1:
; single-pixel span: finish up the s and t calcs, no end-of-span divide
	fxch st(1)	; z*64k | 1/z | t/z | s/z

	fld st(0)	; z*64k | z*64k | 1/z | t/z | s/z
	fmul st(0),st(4)	; s | z*64k | 1/z | t/z | s/z
	fxch st(1)	; z*64k | s | 1/z | t/z | s/z
	fmul st(0),st(3)	; t | s | 1/z | t/z | s/z
	fxch st(1)	; s | t | 1/z | t/z | s/z
	fistp ds:dword ptr[s]	; 1/z | t | t/z | s/z
	fistp ds:dword ptr[t]	; 1/z | t/z | s/z
	jmp LFDIVInFlight1

	align 4
LSetupNotLast1:
; finish up the s and t calcs
	fxch st(1)	; z*64k | 1/z | t/z | s/z

	fld st(0)	; z*64k | z*64k | 1/z | t/z | s/z
	fmul st(0),st(4)	; s | z*64k | 1/z | t/z | s/z
	fxch st(1)	; z*64k | s | 1/z | t/z | s/z
	fmul st(0),st(3)	; t | s | 1/z | t/z | s/z
	fxch st(1)	; s | t | 1/z | t/z | s/z
	fistp ds:dword ptr[s]	; 1/z | t | t/z | s/z
	fistp ds:dword ptr[t]	; 1/z | t/z | s/z

; advance 1/z, s/z and t/z by one full 16-pixel segment
	fadd ds:dword ptr[zi16stepu]
	fxch st(2)
	fadd ds:dword ptr[sdivz16stepu]
	fxch st(2)
	fld ds:dword ptr[tdivz16stepu]
	faddp st(2),st(0)
	fld ds:dword ptr[fp_64k]
	fdiv st(0),st(1)	; z = 1/(1/z), scaled by fp_64k
				; this is what we've gone to all this trouble to
				;  overlap
LFDIVInFlight1:

; add the adjusts to s and t and clamp them to [0, bbextents] / [0, bbextentt]
	add esi,ds:dword ptr[s]
	add edx,ds:dword ptr[t]
	mov ebx,ds:dword ptr[_bbextents]
	mov ebp,ds:dword ptr[_bbextentt]
	cmp esi,ebx
	ja LClampHighOrLow0
LClampReentry0:
	mov ds:dword ptr[s],esi
	mov ebx,ds:dword ptr[pbase]
	shl esi,16	; low 16 bits of s, left-justified
	cmp edx,ebp
	mov ds:dword ptr[sfracf],esi
	ja LClampHighOrLow1
LClampReentry1:
	mov ds:dword ptr[t],edx
	mov esi,ds:dword ptr[s]	; sfrac = scans->sfrac;
	shl edx,16	; low 16 bits of t, left-justified
	mov eax,ds:dword ptr[t]	; tfrac = scans->tfrac;
	sar esi,16
	mov ds:dword ptr[tfracf],edx

;
; calculate the texture starting address
;
	sar eax,16
	mov edx,ds:dword ptr[_cachewidth]
	imul eax,edx	; (tfrac >> 16) * cachewidth
	add esi,ebx
	add esi,eax	; psource = pbase + (sfrac >> 16) +
			;  ((tfrac >> 16) * cachewidth);
;
; determine whether last span or not
;
	cmp ecx,16
	jna LLastSegment

;
; not the last segment; do full 16-wide segment
;
LNotLastSegment:

;
; advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
; get there
;

; pick up after the FDIV that was left in flight previously

	fld st(0)	; duplicate it
	fmul st(0),st(4)	; s = s/z * z
	fxch st(1)
	fmul st(0),st(3)	; t = t/z * z
	fxch st(1)
	fistp ds:dword ptr[snext]
	fistp ds:dword ptr[tnext]
	mov eax,ds:dword ptr[snext]
	mov edx,ds:dword ptr[tnext]

	mov bl,ds:byte ptr[esi]	; get first source texel
	sub ecx,16	; count off this segment's pixels
	mov ebp,ds:dword ptr[_sadjust]
	mov ds:dword ptr[counttemp],ecx	; remember count of remaining pixels

	mov ecx,ds:dword ptr[_tadjust]
	mov ds:byte ptr[edi],bl	; store first dest pixel

	add ebp,eax
	add ecx,edx

	mov eax,ds:dword ptr[_bbextents]
	mov edx,ds:dword ptr[_bbextentt]

; clamp snext to [4096, bbextents]
	cmp ebp,4096
	jl LClampLow2
	cmp ebp,eax
	ja LClampHigh2
LClampReentry2:

; clamp tnext to [4096, bbextentt]
	cmp ecx,4096
	jl LClampLow3
	cmp ecx,edx
	ja LClampHigh3
LClampReentry3:

	mov ds:dword ptr[snext],ebp
	mov ds:dword ptr[tnext],ecx

	sub ebp,ds:dword ptr[s]	; s delta across the 16-pixel segment
	sub ecx,ds:dword ptr[t]	; t delta across the 16-pixel segment

;
; set up advancetable
;
; The deltas cover 16 pixels, so the whole-texel part of the per-pixel step
; is delta >> (16+4) and the fractional part, left-justified in 32 bits, is
; delta << (16-4).
;
	mov eax,ecx
	mov edx,ebp
	sar eax,20	; tstep >>= 20; whole-texel t step per pixel
	jz LZero	; zero whole-texel t step: skip the multiply
	sar edx,20	; sstep >>= 20; whole-texel s step per pixel
	mov ebx,ds:dword ptr[_cachewidth]
	imul eax,ebx
	jmp LSetUp1

LZero:
	sar edx,20	; sstep >>= 20; whole-texel s step per pixel
	mov ebx,ds:dword ptr[_cachewidth]

LSetUp1:

	add eax,edx	; add in sstep
			; (tstep >> 20) * cachewidth + (sstep >> 20);
	mov edx,ds:dword ptr[tfracf]
	mov ds:dword ptr[advancetable+4],eax	; advance base in t
	add eax,ebx	; ((tstep >> 20) + 1) * cachewidth +
			;  (sstep >> 20);
	shl ebp,12	; left-justify sstep fractional part
	mov ebx,ds:dword ptr[sfracf]
	shl ecx,12	; left-justify tstep fractional part
	mov ds:dword ptr[advancetable],eax	; advance extra in t

;
; Unrolled pixel steps. Register roles in every step below:
;   %edx = t fraction accumulator     %ebx = s fraction accumulator
;   %ebp = s fractional step          %esi = source texel pointer
;   %edi = destination pointer        %al  = texel in transit
; "add edx,tstep / sbb ecx,ecx" turns the t carry into -1 or 0, which selects
; advancetable[0] (extra row) or advancetable[1] (base); "add ebx,ebp / adc
; esi,..." then folds the s carry into the same pointer update. The order of
; these instructions must not change: the carry flag links them.
;
	mov ds:dword ptr[tstep],ecx
	add edx,ecx	; advance tfrac fractional part by tstep frac

	sbb ecx,ecx	; turn tstep carry into -1 (0 if none)
	add ebx,ebp	; advance sfrac fractional part by sstep frac
	adc esi,ds:dword ptr[advancetable+4+ecx*4]	; point to next source texel

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov al,ds:byte ptr[esi]
	add ebx,ebp
	mov ds:byte ptr[1+edi],al
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[2+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[3+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[4+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[5+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[6+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[7+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]


;
; start FDIV for end of next segment in flight, so it can overlap
;
	mov ecx,ds:dword ptr[counttemp]
	cmp ecx,16	; more than one segment after this?
	ja LSetupNotLast2	; yes

	dec ecx
	jz LFDIVInFlight2	; if only one pixel, no need to start an FDIV
	mov ds:dword ptr[spancountminus1],ecx
	fild ds:dword ptr[spancountminus1]

	fld ds:dword ptr[_d_zistepu]	; C(d_zistepu) | spancountminus1
	fmul st(0),st(1)	; C(d_zistepu)*scm1 | scm1
	fld ds:dword ptr[_d_tdivzstepu]	; C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul st(0),st(2)	; C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch st(1)	; C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp st(3),st(0)	; C(d_tdivzstepu)*scm1 | scm1
	fxch st(1)	; scm1 | C(d_tdivzstepu)*scm1
	fmul ds:dword ptr[_d_sdivzstepu]	; C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch st(1)	; C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp st(3),st(0)	; C(d_sdivzstepu)*scm1
	fld ds:dword ptr[fp_64k]	; 64k | C(d_sdivzstepu)*scm1
	fxch st(1)	; C(d_sdivzstepu)*scm1 | 64k
	faddp st(4),st(0)	; 64k

	fdiv st(0),st(1)	; this is what we've gone to all this trouble to
				;  overlap
	jmp LFDIVInFlight2

	align 4
LSetupNotLast2:
	fadd ds:dword ptr[zi16stepu]
	fxch st(2)
	fadd ds:dword ptr[sdivz16stepu]
	fxch st(2)
	fld ds:dword ptr[tdivz16stepu]
	faddp st(2),st(0)
	fld ds:dword ptr[fp_64k]
	fdiv st(0),st(1)	; z = 1/(1/z), scaled by fp_64k
				; this is what we've gone to all this trouble to
				;  overlap
LFDIVInFlight2:
	mov ds:dword ptr[counttemp],ecx	; %ecx is about to be reused as the carry mask

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[8+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[9+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[10+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[11+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[12+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[13+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[14+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

; segment done: the end-of-segment s/t become the start of the next one
	add edi,16
	mov ds:dword ptr[tfracf],edx
	mov edx,ds:dword ptr[snext]
	mov ds:dword ptr[sfracf],ebx
	mov ebx,ds:dword ptr[tnext]
	mov ds:dword ptr[s],edx
	mov ds:dword ptr[t],ebx

	mov ecx,ds:dword ptr[counttemp]	; retrieve count

;
; determine whether last span or not
;
	cmp ecx,16	; are there multiple segments remaining?
	mov ds:byte ptr[-1+edi],al	; 16th pixel of the segment just finished
	ja LNotLastSegment	; yes

;
; last segment of scan
;
LLastSegment:

;
; advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
; get there. The number of pixels left is variable, and we want to land on the
; last pixel, not step one past it, so we can't run into arithmetic problems
;
; %ecx = (pixels in this segment) - 1 here
;
	test ecx,ecx
	jz LNoSteps	; just draw the last pixel and we're done

; pick up after the FDIV that was left in flight previously


	fld st(0)	; duplicate it
	fmul st(0),st(4)	; s = s/z * z
	fxch st(1)
	fmul st(0),st(3)	; t = t/z * z
	fxch st(1)
	fistp ds:dword ptr[snext]
	fistp ds:dword ptr[tnext]

	mov al,ds:byte ptr[esi]	; load first texel in segment
	mov ebx,ds:dword ptr[_tadjust]
	mov ds:byte ptr[edi],al	; store first pixel in segment
	mov eax,ds:dword ptr[_sadjust]

	add eax,ds:dword ptr[snext]
	add ebx,ds:dword ptr[tnext]

	mov ebp,ds:dword ptr[_bbextents]
	mov edx,ds:dword ptr[_bbextentt]

; clamp snext to [4096, bbextents]
	cmp eax,4096
	jl LClampLow4
	cmp eax,ebp
	ja LClampHigh4
LClampReentry4:
	mov ds:dword ptr[snext],eax

; clamp tnext to [4096, bbextentt]
	cmp ebx,4096
	jl LClampLow5
	cmp ebx,edx
	ja LClampHigh5
LClampReentry5:

	cmp ecx,1	; with two pixels in the segment there is only one
	je LOnlyOneStep	;  step, of the segment length: don't bother dividing

	sub eax,ds:dword ptr[s]
	sub ebx,ds:dword ptr[t]

	add eax,eax	; convert to 15.17 format so multiply by 1.31
	add ebx,ebx	;  reciprocal yields 16.48

; one-operand imul leaves the 64-bit product in %edx:%eax; the high dword in
; %edx is the per-pixel step
	imul ds:dword ptr[reciprocal_table_16-8+ecx*4]	; sstep = (snext - s) /
							;  (spancount-1)
	mov ebp,edx

	mov eax,ebx
	imul ds:dword ptr[reciprocal_table_16-8+ecx*4]	; tstep = (tnext - t) /
							;  (spancount-1)
LSetEntryvec:
;
; set up advancetable
;
; in: %ebp = sstep, %edx = tstep, %ecx = (pixels in segment) - 1
;
	mov ebx,ds:dword ptr[entryvec_table_16+ecx*4]
	mov eax,edx
	mov ds:dword ptr[jumptemp],ebx	; entry point into code for RET later
	mov ecx,ebp
	sar edx,16	; tstep >>= 16;
	mov ebx,ds:dword ptr[_cachewidth]
	sar ecx,16	; sstep >>= 16;
	imul edx,ebx

	add edx,ecx	; add in sstep
			; (tstep >> 16) * cachewidth + (sstep >> 16);
	mov ecx,ds:dword ptr[tfracf]
	mov ds:dword ptr[advancetable+4],edx	; advance base in t
	add edx,ebx	; ((tstep >> 16) + 1) * cachewidth +
			;  (sstep >> 16);
	shl ebp,16	; left-justify sstep fractional part
	mov ebx,ds:dword ptr[sfracf]
	shl eax,16	; left-justify tstep fractional part
	mov ds:dword ptr[advancetable],edx	; advance extra in t

	mov ds:dword ptr[tstep],eax
	mov edx,ecx
	add edx,eax
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	jmp dword ptr[jumptemp]	; jump to the number-of-pixels handler

;----------------------------------------

LNoSteps:
	mov al,ds:byte ptr[esi]	; load first texel in segment
	sub edi,15	; adjust for hardwired offset
	jmp LEndSpan


LOnlyOneStep:
	sub eax,ds:dword ptr[s]
	sub ebx,ds:dword ptr[t]
	mov ebp,eax	; sstep = snext - s
	mov edx,ebx	; tstep = tnext - t
	jmp LSetEntryvec

;----------------------------------------
;
; Entry points for a final segment of N pixels, reached through
; entryvec_table_16. Each one backs %edi up so that the hardwired
; destination offsets in the shared tail below end on the segment's last
; pixel, performs one texel step, and joins the tail at the matching
; LEntryN_16 label. On entry %eax still holds the t fractional step.
;

	public Entry2_16, Entry3_16, Entry4_16, Entry5_16
	public Entry6_16, Entry7_16, Entry8_16, Entry9_16
	public Entry10_16, Entry11_16, Entry12_16, Entry13_16
	public Entry14_16, Entry15_16, Entry16_16

Entry2_16:
	sub edi,14	; adjust for hardwired offsets
	mov al,ds:byte ptr[esi]
	jmp LEntry2_16

;----------------------------------------

Entry3_16:
	sub edi,13	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	jmp LEntry3_16

;----------------------------------------

Entry4_16:
	sub edi,12	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry4_16

;----------------------------------------

Entry5_16:
	sub edi,11	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry5_16

;----------------------------------------

Entry6_16:
	sub edi,10	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry6_16

;----------------------------------------

Entry7_16:
	sub edi,9	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry7_16

;----------------------------------------

Entry8_16:
	sub edi,8	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry8_16

;----------------------------------------

Entry9_16:
	sub edi,7	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry9_16

;----------------------------------------

Entry10_16:
	sub edi,6	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry10_16

;----------------------------------------

Entry11_16:
	sub edi,5	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry11_16

;----------------------------------------

Entry12_16:
	sub edi,4	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry12_16

;----------------------------------------

Entry13_16:
	sub edi,3	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry13_16

;----------------------------------------

Entry14_16:
	sub edi,2	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry14_16

;----------------------------------------

Entry15_16:
	dec edi	; adjust for hardwired offsets
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
	jmp LEntry15_16

;----------------------------------------

Entry16_16:
	add edx,eax
	mov al,ds:byte ptr[esi]
	sbb ecx,ecx
	add ebx,ebp
	adc esi,ds:dword ptr[advancetable+4+ecx*4]

	add edx,ds:dword ptr[tstep]
	sbb ecx,ecx
	mov ds:byte ptr[1+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry15_16:
	sbb ecx,ecx
	mov ds:byte ptr[2+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry14_16:
	sbb ecx,ecx
	mov ds:byte ptr[3+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry13_16:
	sbb ecx,ecx
	mov ds:byte ptr[4+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry12_16:
	sbb ecx,ecx
	mov ds:byte ptr[5+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry11_16:
	sbb ecx,ecx
	mov ds:byte ptr[6+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry10_16:
	sbb ecx,ecx
	mov ds:byte ptr[7+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry9_16:
	sbb ecx,ecx
	mov ds:byte ptr[8+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry8_16:
	sbb ecx,ecx
	mov ds:byte ptr[9+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry7_16:
	sbb ecx,ecx
	mov ds:byte ptr[10+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry6_16:
	sbb ecx,ecx
	mov ds:byte ptr[11+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry5_16:
	sbb ecx,ecx
	mov ds:byte ptr[12+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
	add edx,ds:dword ptr[tstep]
LEntry4_16:
	sbb ecx,ecx
	mov ds:byte ptr[13+edi],al
	add ebx,ebp
	mov al,ds:byte ptr[esi]
	adc esi,ds:dword ptr[advancetable+4+ecx*4]
LEntry3_16:
	mov ds:byte ptr[14+edi],al
	mov al,ds:byte ptr[esi]
LEntry2_16:

LEndSpan:

;
; clear s/z, t/z, 1/z from FP stack
;
	fstp st(0)
	fstp st(0)
	fstp st(0)

	mov ebx,ds:dword ptr[pspantemp]	; restore spans pointer
	mov ebx,ds:dword ptr[espan_t_pnext+ebx]	; point to next span
	test ebx,ebx	; any more spans?
	mov ds:byte ptr[15+edi],al	; store the span's last pixel (mov leaves the flags alone)
	jnz LSpanLoop	; more spans

	pop ebx	; restore register variables
	pop esi
	pop edi
	pop ebp	; restore the caller's stack frame
	ret


;----------------------------------------------------------------------
; Horizontal span z drawing code for polygons, with no transparency.
; Writes one 16-bit value per pixel: the high word of a 32-bit fixed-point
; 1/z that is stepped by izistep from pixel to pixel.
;
; Assumes there is at least one span in pzspans, and that every span
; contains at least one pixel
;----------------------------------------------------------------------



; Both clamp stubs draw the whole span with a constant z: %edx is set to
; 040000000h, the step in %ebx is zeroed for this span only, and the
; unconverted 1/z is popped off the FP stack.
; NOTE(review): 040000000h is 0.5 * 2**31, which matches the clamp threshold
; if Float2ToThe31nd is 2**31 as its name suggests -- confirm.

; z-clamp on a non-negative gradient span
LClamp:
	mov edx,040000000h
	xor ebx,ebx
	fstp st(0)
	jmp LZDraw

; z-clamp on a negative gradient span
LClampNeg:
	mov edx,040000000h
	xor ebx,ebx
	fstp st(0)
	jmp LZDrawNeg


; stack offset of the span-list argument once the four registers below have
; been pushed: 4 bytes of return address + 16 bytes of saved registers
pzspans	equ	4+16

	public _D_DrawZSpans
_D_DrawZSpans:
	push ebp	; preserve caller's stack frame
	push edi
	push esi	; preserve register variables
	push ebx

	fld ds:dword ptr[_d_zistepu]
	mov eax,ds:dword ptr[_d_zistepu]
	mov esi,ds:dword ptr[pzspans+esp]
	test eax,eax
	jz LFNegSpan	; taken only when the bit pattern of d_zistepu is all zero

	fmul ds:dword ptr[Float2ToThe31nd]
	fistp ds:dword ptr[izistep]	; note: we are relying on FP exceptions being turned
					; off here to avoid range problems

LFSpanLoop:
; izistep is reloaded for every span because the clamp path (LClamp) zeroes
; %ebx; without the reload every span after a clamped one would be drawn
; with a zero z step
	mov ebx,ds:dword ptr[izistep]

; set up the initial 1/z value
	fild ds:dword ptr[espan_t_v+esi]
	fild ds:dword ptr[espan_t_u+esi]
	mov ecx,ds:dword ptr[espan_t_v+esi]
	mov edi,ds:dword ptr[_d_pzbuffer]
	fmul ds:dword ptr[_d_zistepu]
	fxch st(1)
	fmul ds:dword ptr[_d_zistepv]
	fxch st(1)
	fadd ds:dword ptr[_d_ziorigin]
	imul ecx,ds:dword ptr[_d_zrowbytes]
	faddp st(1),st(0)

; clamp if z is nearer than 2 (1/z > 0.5)
	fcom ds:dword ptr[float_point5]
	add edi,ecx
	mov edx,ds:dword ptr[espan_t_u+esi]
	add edx,edx	; word count
	mov ecx,ds:dword ptr[espan_t_count+esi]
	add edi,edx	; pdest = &pdestspan[scans->u];
	push esi	; preserve spans pointer
	fnstsw ax
	test ah,045h	; C3, C2 and C0 all clear: st(0) is greater than the operand
	jz LClamp

	fmul ds:dword ptr[Float2ToThe31nd]
	fistp ds:dword ptr[izi]	; note: we are relying on FP exceptions being turned
				; off here to avoid problems when the span is closer
				; than 1/(2**31)
	mov edx,ds:dword ptr[izi]

; at this point:
; %ebx = izistep
; %ecx = count
; %edx = izi
; %edi = pdest

LZDraw:

; do a single pixel up front, if necessary to dword align the destination
	test edi,2
	jz LFMiddle
	mov eax,edx
	add edx,ebx
	shr eax,16
	dec ecx
	mov ds:word ptr[edi],ax
	add edi,2

; do middle a pair of aligned dwords at a time
LFMiddle:
	push ecx	; keep the count; its low bit is tested at LFLast
	shr ecx,1	; count / 2
	jz LFLast	; no aligned dwords to do
	shr ecx,1	; (count / 2) / 2
	jnc LFMiddleLoop	; even number of aligned dwords to do

; odd number of dwords: write one dword (two pixels) before the paired loop
	mov eax,edx
	add edx,ebx
	shr eax,16
	mov esi,edx
	add edx,ebx
	and esi,0FFFF0000h
	or eax,esi
	mov ds:dword ptr[edi],eax
	add edi,4
	and ecx,ecx
	jz LFLast

LFMiddleLoop:
; four pixels per iteration; each dword packs one pixel's z in the low word
; and the next pixel's z in the high word
	mov eax,edx
	add edx,ebx
	shr eax,16
	mov esi,edx
	add edx,ebx
	and esi,0FFFF0000h
	or eax,esi
	mov ebp,edx
	mov ds:dword ptr[edi],eax
	add edx,ebx
	shr ebp,16
	mov esi,edx
	add edx,ebx
	and esi,0FFFF0000h
	or ebp,esi
	mov ds:dword ptr[4+edi],ebp	; FIXME: eliminate register contention
	add edi,8

	dec ecx
	jnz LFMiddleLoop

LFLast:
	pop ecx	; retrieve count
	pop esi	; retrieve span pointer

; do the last, unaligned pixel, if there is one
	and ecx,1	; is there an odd pixel left to do?
	jz LFSpanDone	; no
	shr edx,16
	mov ds:word ptr[edi],dx	; do the final pixel's z

LFSpanDone:
	mov esi,ds:dword ptr[espan_t_pnext+esi]
	test esi,esi
	jnz LFSpanLoop

	jmp LFDone

LFNegSpan:
	fmul ds:dword ptr[FloatMinus2ToThe31nd]
	fistp ds:dword ptr[izistep]	; note: we are relying on FP exceptions being turned
					; off here to avoid range problems

LFNegSpanLoop:
; izistep is reloaded for every span because the clamp path (LClampNeg)
; zeroes %ebx
	mov ebx,ds:dword ptr[izistep]

; set up the initial 1/z value
	fild ds:dword ptr[espan_t_v+esi]
	fild ds:dword ptr[espan_t_u+esi]
	mov ecx,ds:dword ptr[espan_t_v+esi]
	mov edi,ds:dword ptr[_d_pzbuffer]
	fmul ds:dword ptr[_d_zistepu]
	fxch st(1)
	fmul ds:dword ptr[_d_zistepv]
	fxch st(1)
	fadd ds:dword ptr[_d_ziorigin]
	imul ecx,ds:dword ptr[_d_zrowbytes]
	faddp st(1),st(0)

; clamp if z is nearer than 2 (1/z > 0.5)
	fcom ds:dword ptr[float_point5]
	add edi,ecx
	mov edx,ds:dword ptr[espan_t_u+esi]
	add edx,edx	; word count
	mov ecx,ds:dword ptr[espan_t_count+esi]
	add edi,edx	; pdest = &pdestspan[scans->u];
	push esi	; preserve spans pointer
	fnstsw ax
	test ah,045h	; C3, C2 and C0 all clear: st(0) is greater than the operand
	jz LClampNeg

	fmul ds:dword ptr[Float2ToThe31nd]
	fistp ds:dword ptr[izi]	; note: we are relying on FP exceptions being turned
				; off here to avoid problems when the span is closer
				; than 1/(2**31)
	mov edx,ds:dword ptr[izi]

; at this point:
; %ebx = izistep
; %ecx = count
; %edx = izi
; %edi = pdest

LZDrawNeg:

; do a single pixel up front, if necessary to dword align the destination
	test edi,2
	jz LFNegMiddle
	mov eax,edx
	sub edx,ebx
	shr eax,16
	dec ecx
	mov ds:word ptr[edi],ax
	add edi,2

; do middle a pair of aligned dwords at a time
LFNegMiddle:
	push ecx	; keep the count; its low bit is tested at LFNegLast
	shr ecx,1	; count / 2
	jz LFNegLast	; no aligned dwords to do
	shr ecx,1	; (count / 2) / 2
	jnc LFNegMiddleLoop	; even number of aligned dwords to do

; odd number of dwords: write one dword (two pixels) before the paired loop
	mov eax,edx
	sub edx,ebx
	shr eax,16
	mov esi,edx
	sub edx,ebx
	and esi,0FFFF0000h
	or eax,esi
	mov ds:dword ptr[edi],eax
	add edi,4
	and ecx,ecx
	jz LFNegLast

LFNegMiddleLoop:
; four pixels per iteration, stepping z with sub instead of add
	mov eax,edx
	sub edx,ebx
	shr eax,16
	mov esi,edx
	sub edx,ebx
	and esi,0FFFF0000h
	or eax,esi
	mov ebp,edx
	mov ds:dword ptr[edi],eax
	sub edx,ebx
	shr ebp,16
	mov esi,edx
	sub edx,ebx
	and esi,0FFFF0000h
	or ebp,esi
	mov ds:dword ptr[4+edi],ebp	; FIXME: eliminate register contention
	add edi,8

	dec ecx
	jnz LFNegMiddleLoop

LFNegLast:
	pop ecx	; retrieve count
	pop esi	; retrieve span pointer

; do the last, unaligned pixel, if there is one
	and ecx,1	; is there an odd pixel left to do?
	jz LFNegSpanDone	; no
	shr edx,16
	mov ds:word ptr[edi],dx	; do the final pixel's z

LFNegSpanDone:
	mov esi,ds:dword ptr[espan_t_pnext+esi]
	test esi,esi
	jnz LFNegSpanLoop

LFDone:
	pop ebx	; restore register variables
	pop esi
	pop edi
	pop ebp	; restore the caller's stack frame
	ret



_TEXT ENDS
endif	;id386
	END