Quake-2

Quake 2 GPL Source Release
Log | Files | Refs

r_draw16.s (27739B)


      1 //
      2 // r_draw16.s (historically d_draw16.s)
      3 // x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
      4 // subdivision.
      5 //
      6 
      7 #include "qasm.h"
      8 #include "d_ifacea.h"
      9 
     10 #if	id386
     11 
     12 //----------------------------------------------------------------------
     13 // 8-bpp horizontal span drawing code for polygons, with no transparency and
     14 // 16-pixel subdivision.
     15 //
     16 // Assumes there is at least one span in pspans, and that every span
     17 // contains at least one pixel
     18 //----------------------------------------------------------------------
     19 
     20 	.data
     21 
     22 	.text
     23 
     24 // out-of-line, rarely-needed clamping code
     25 
// Pair 0/1: entered from the start-of-span clamp with the flags of an
// unsigned compare (cmpl extents,reg / ja) still live.  A signed jg then
// separates true overflow (clamp to the bbextents/bbextentt limit) from a
// negative value that wrapped around the unsigned compare (clamp to 0).
LClampHigh0:
	movl	C(bbextents),%esi	// s too large: clamp to max s extent
	jmp		LClampReentry0
LClampHighOrLow0:
	jg		LClampHigh0			// signed-positive overflow -> high clamp
	xorl	%esi,%esi			// else s went negative: clamp to 0
	jmp		LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx	// t too large: clamp to max t extent
	jmp		LClampReentry1
LClampHighOrLow1:
	jg		LClampHigh1			// signed-positive overflow -> high clamp
	xorl	%edx,%edx			// else t went negative: clamp to 0
	jmp		LClampReentry1

// Pairs 2..5: per-segment end-of-span clamps.  Callers test low and high
// separately (jl $4096 / ja extents), so each stub only loads the limit.
// The low clamp is 4096, not 0 — a small positive floor in 16.16 fixed
// point (presumably a guard band against stepping off the texture edge;
// matches the other span drawers in this renderer).
LClampLow2:
	movl	$4096,%ebp			// snext floor
	jmp		LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp	// snext ceiling
	jmp		LClampReentry2

LClampLow3:
	movl	$4096,%ecx			// tnext floor
	jmp		LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx	// tnext ceiling
	jmp		LClampReentry3

LClampLow4:
	movl	$4096,%eax			// last-segment snext floor
	jmp		LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax	// last-segment snext ceiling
	jmp		LClampReentry4

LClampLow5:
	movl	$4096,%ebx			// last-segment tnext floor
	jmp		LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx	// last-segment tnext ceiling
	jmp		LClampReentry5
     70 
// Stack offset of the first C argument: 4 (return address) + 16 (the four
// callee-saved registers pushed below).
#define pspans	4+16

//----------------------------------------------------------------------
// void D_DrawSpans16 (espan_t *pspans)
//
// Draws perspective-correct 8-bpp texture spans, dividing each span into
// 16-pixel segments so only one FDIV is needed per segment; the FDIV for
// the *next* segment is started early so it overlaps the integer pixel
// loop (see LFDIVInFlight1/2 below).
//----------------------------------------------------------------------
	.align 4
.globl C(D_DrawSpans16)
C(D_DrawSpans16):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
// and span list pointers.  FP loads/multiplies are interleaved with the
// integer moves so they execute in the FPU's shadow.
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_16				// s/z step across a whole 16-pixel segment
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_16				// t/z step across a whole 16-pixel segment
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_16				// 1/z step across a whole 16-pixel segment
	movl	%edx,pbase			// pbase = cacheblock
	fstps	zi16stepu			// note: stores pop in reverse push order
	fstps	tdivz16stepu
	fstps	sdivz16stepu
//----------------------------------------------------------------------
// Top of the per-span loop: from the span's integer (u, v) compute the
// perspective quantities s/z, t/z and 1/z at the span start, leave them
// on the FP stack, and start the z = 64k/(1/z) divide.  The running
// comments show the FP stack after each instruction (top first).
//----------------------------------------------------------------------
LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld		%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
							//  du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
							//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
							//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
							//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |
							//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)	// dv*d_zistepv |
							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
							//  dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
							//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
							//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
							//  du*d_zistepu; stays in %st(0) at end
							// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z
							// slow FDIV starts here; the integer code
							//  below runs while it completes
    159 
//
// point %edi to the first pixel in the span (integer work done under the
// in-flight FDIV above)
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp	// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span.  Three cases:
//   count > 16 : full segment follows -> LSetupNotLast1 (step by 16)
//   count == 1 : no second point needed -> LCleanup1 (no FDIV)
//   otherwise  : short final segment -> step by spancountminus1
//
	cmpl	$16,%ecx
	ja		LSetupNotLast1

	decl	%ecx
	jz		LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

// advance s/z, t/z, 1/z to the *last* pixel of this short span
// (step = per-pixel u step * spancountminus1), then start the FDIV
	fildl	spancountminus1

	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight1

LCleanup1:
// finish up the s and t calcs (one-pixel span: no end-of-span FDIV started)
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z
	jmp		LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs, then step s/z, t/z, 1/z by a full 16-pixel
// segment (the *16stepu values precomputed at function entry) and start
// the divide for the segment end
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight1:
    255 
// Apply the screen-space adjustments and clamp the span-start s and t
// into [0, bbextents]/[0, bbextentt] (out-of-line stubs 0/1 above handle
// the rare out-of-range cases), then split each into integer and
// left-justified fractional parts and compute the first texel address.
	addl	s,%esi				// s = sadjust + s
	addl	t,%edx				// t = tadjust + t
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja		LClampHighOrLow0	// unsigned: catches both >max and negative
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi			// left-justify s fraction
	cmpl	%ebp,%edx
	movl	%esi,sfracf
	ja		LClampHighOrLow1	// unsigned: catches both >max and negative
LClampReentry1:
	movl	%edx,t
	movl	s,%esi					// sfrac = scans->sfrac;
	shll	$16,%edx			// left-justify t fraction
	movl	t,%eax					// tfrac = scans->tfrac;
	sarl	$16,%esi			// integer part of s
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax			// integer part of t
	movl	C(cachewidth),%edx
	imull	%edx,%eax				// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +
									//           ((tfrac >> 16) * cachewidth);
//
// determine whether last span or not
//
	cmpl	$16,%ecx
	jna		LLastSegment
    291 
//
// not the last segment; do full 16-wide segment
//
// Per-pixel idiom used throughout the unrolled code below:
//   addl tstep,%edx  — advance tfrac's fractional part
//   sbbl %ecx,%ecx   — %ecx = -1 if that add carried, else 0
//   addl %ebp,%ebx   — advance sfrac's fractional part; its carry feeds adcl
//   adcl advancetable+4(,%ecx,4),%esi
//                    — step the source pointer: advancetable[1] ("base in
//                      t") when tfrac didn't carry, advancetable[0] ("extra
//                      in t", +cachewidth) when it did, plus the sfrac carry
// Loads/stores are interleaved across pixels to hide address-generation
// and memory latencies while the next segment's FDIV runs.
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl	// get first source texel
	subl	$16,%ecx		// count off this segments' pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp	// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)	// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$4096,%ebp
	jl		LClampLow2
	cmpl	%eax,%ebp
	ja		LClampHigh2
LClampReentry2:

	cmpl	$4096,%ecx
	jl		LClampLow3
	cmpl	%edx,%ecx
	ja		LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp			// ebp = 16 pixels' worth of s delta
	subl	t,%ecx			// ecx = 16 pixels' worth of t delta
	
//
// set up advancetable
//
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$20,%eax			// tstep >>= 16; (the extra 4 bits divide the
								//  16-pixel delta down to a per-pixel step)
	jz		LZero				// skip the imull when the t row step is 0
	sarl	$20,%edx			// sstep >>= 16; (likewise /16 per pixel)
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp		LSetUp1

LZero:
	sarl	$20,%edx			// sstep >>= 16;
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$12,%ebp			// left-justify sstep fractional part
								//  (12 = 16 minus the 4 bits of /16 above)
	movl	sfracf,%ebx
	shll	$12,%ecx			// left-justify tstep fractional part
	movl	%eax,advancetable	// advance extra in t

	movl	%ecx,tstep
	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

// pixels 1..7 of the segment (pixel 0 was stored above; pixels 8..15 are
// drawn after the next-segment FDIV is started)
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
    437 
//
// start FDIV for end of next segment in flight, so it can overlap the
// second half of this segment's pixel stores (same three cases as the
// start-of-span dispatch: full segment / one pixel / short final segment)
//
	movl	counttemp,%ecx
	cmpl	$16,%ecx			// more than one segment after this?
	ja		LSetupNotLast2	// yes

	decl	%ecx
	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)	// 64k

	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight2

	.align	4
LSetupNotLast2:
// full segment follows: step by the precomputed 16-pixel increments
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight2:
	movl	%ecx,counttemp	// stash remaining-pixel count again

// pixels 8..15 of the segment, same carry-stepping idiom as above;
// pixel 15's store is deferred to the movb %al,-1(%edi) after the
// destination pointer is advanced
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,14(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

// roll s/t state forward to the segment end for the next iteration
	addl	$16,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last span or not
//
	cmpl	$16,%ecx				// are there multiple segments remaining?
	movb	%al,-1(%edi)		// store pixel 15 (edi already advanced by 16)
	ja		LNotLastSegment		// yes
    548 
//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
	testl	%ecx,%ecx
	jz		LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$4096,%eax
	jl		LClampLow4
	cmpl	%ebp,%eax
	ja		LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$4096,%ebx
	jl		LClampLow5
	cmpl	%edx,%ebx
	ja		LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// don't bother 
	je		LOnlyOneStep	// if two pixels in segment, there's only one step,
							//  of the segment length
	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

// reciprocal_table_16[count-2] holds 1/(count-1) in 1.31 fixed point;
// the signed imull leaves the 16.16-style step in %edx (high dword)
	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
											//  (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
											//  (spancount-1)
LSetEntryvec:
//
// set up advancetable, then dispatch through entryvec_table_16[count] to
// the EntryN_16 stub for exactly the remaining pixel count
//
	movl	entryvec_table_16(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// entry point into code for RET later
	movl	%ebp,%ecx
	sarl	$16,%edx			// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx			// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$16,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax			// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

	movl	%eax,tstep
	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp		*jumptemp			// jump to the number-of-pixels handler

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$15,%edi			// adjust for hardwired offset
								//  (LEndSpan stores to 15(%edi))
	jmp		LEndSpan


LOnlyOneStep:
// two-pixel segment: the single step is just the full segment delta
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp		LSetEntryvec
    660 
    661 //----------------------------------------
    662 
//----------------------------------------------------------------------
// Entry stubs for the final (short) segment, reached via *jumptemp above.
// The shared fall-through chain below (Entry16_16..LEndSpan) stores to
// hardwired offsets 1..15(%edi), so each EntryN_16 first backs %edi up by
// (16 - N) to make those offsets land on the right pixels, performs the
// first texel fetch (and, for N >= 3, the first carry-step of the source
// pointer, plus priming tstep for the chain), then jumps into the chain
// at the point that draws exactly N pixels.
//----------------------------------------------------------------------
.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
.globl	Entry14_16, Entry15_16

Entry2_16:
	subl	$14,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp		LEntry2_16

//----------------------------------------

Entry3_16:
	subl	$13,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp		LEntry3_16

//----------------------------------------

Entry4_16:
	subl	$12,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry4_16

//----------------------------------------

Entry5_16:
	subl	$11,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry5_16

//----------------------------------------

Entry6_16:
	subl	$10,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry6_16

//----------------------------------------

Entry7_16:
	subl	$9,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry7_16

//----------------------------------------

Entry8_16:
	subl	$8,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry8_16

//----------------------------------------

Entry9_16:
	subl	$7,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry9_16

//----------------------------------------

Entry10_16:
	subl	$6,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry10_16

//----------------------------------------

Entry11_16:
	subl	$5,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry11_16

//----------------------------------------

Entry12_16:
	subl	$4,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry12_16

//----------------------------------------

Entry13_16:
	subl	$3,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry13_16

//----------------------------------------

Entry14_16:
	subl	$2,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry14_16

//----------------------------------------

Entry15_16:
	decl	%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry15_16
    827 
    828 //----------------------------------------
    829 
//----------------------------------------------------------------------
// Entry16_16 (also Entry16_16's globl is declared above) heads the shared
// fall-through chain: each LEntryN_16 label is the point from which
// exactly N pixels get stored (offsets 1..15 off the pre-adjusted %edi;
// the 15(%edi) store happens in LEndSpan).  Same carry-stepping per-pixel
// idiom as the main 16-wide segment code.
//----------------------------------------------------------------------
Entry16_16:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry15_16:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry14_16:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry13_16:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry12_16:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry11_16:
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry10_16:
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry9_16:
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry8_16:
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry7_16:
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry6_16:
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry5_16:
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry4_16:
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LEntry3_16:
	movb	%al,14(%edi)
	movb	(%esi),%al
LEntry2_16:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)

	movl	pspantemp,%ebx				// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx			// any more spans?
	movb	%al,15(%edi)		// store the final pixel of the span
	jnz		LSpanLoop			// more spans

	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret
    952 
//----------------------------------------------------------------------
// 8-bpp horizontal span z drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pzspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------

	.text

// Out-of-line z clamps, entered from D_DrawZSpans when the span's starting
// 1/z compares above 0.5 (z nearer than 2): force izi to 0x40000000 (the
// 0.5 clamp value in 2^31-scaled fixed point), zero the step, and pop the
// 1/z the skipped fistpl would have consumed.
// NOTE(review): zeroing %ebx here leaves izistep = 0 for any later spans
// in the list as well (LFSpanLoop does not reload it) — confirm this is
// the intended behavior.

// z-clamp on a non-negative gradient span
LClamp:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)
	jmp		LZDraw

// z-clamp on a negative gradient span (LZDrawNeg is beyond this excerpt)
LClampNeg:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)
	jmp		LZDrawNeg


// Stack offset of the first C argument: 4 (return address) + 16 (four
// callee-saved registers pushed on entry).
#define pzspans	4+16
    978 
//----------------------------------------------------------------------
// void D_DrawZSpans (espan_t *pzspans)
//
// Fills the 16-bit z-buffer for each span with linearly interpolated
// 1/z values (2^31-scaled fixed point, top 16 bits stored per pixel).
// Only the non-negative-gradient path is visible in this excerpt; the
// zero/negative d_zistepu path (LFNegSpan/LZDrawNeg) and the function
// epilogue continue beyond it.
//----------------------------------------------------------------------
.globl C(D_DrawZSpans)
C(D_DrawZSpans):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

	flds	C(d_zistepu)
	movl	C(d_zistepu),%eax	// integer view of the float's bits
	movl	pzspans(%esp),%esi
	testl	%eax,%eax			// all-zero bits == step is 0.0
	jz		LFNegSpan			// zero step handled by the other path
								//  (beyond this excerpt)

	fmuls	Float2ToThe31nd
	fistpl	izistep		// note: we are relying on FP exceptions being turned
						// off here to avoid range problems
	movl	izistep,%ebx	// remains loaded for all spans

LFSpanLoop:
// set up the initial 1/z value at the span's (u, v); integer address
// arithmetic is interleaved with the FP work
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx				// word count
	movl	espan_t_count(%esi),%ecx
	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
	pushl	%esi		// preserve spans pointer
	fnstsw	%ax
	testb	$0x45,%ah	// C0|C2|C3 all clear -> st(0) > 0.5
	jz		LClamp

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
						// off here to avoid problems when the span is closer
						// than 1/(2**31)
	movl	izi,%edx

// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest

LZDraw:

// do a single pixel up front, if necessary to dword align the destination
	testl	$2,%edi
	jz		LFMiddle
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax		// take the top 16 bits of izi
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do middle a pair of aligned dwords at a time
LFMiddle:
	pushl	%ecx
	shrl	$1,%ecx				// count / 2
	jz		LFLast				// no aligned dwords to do
	shrl	$1,%ecx				// (count / 2) / 2
	jnc		LFMiddleLoop		// even number of aligned dwords to do

// odd dword count: do one dword (two z words packed low/high) up front
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax			// pack two 16-bit z values into one dword
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx
	jz		LFLast

LFMiddleLoop:
// two dwords (four z words) per iteration
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	addl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%ebp
	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz		LFMiddleLoop

LFLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz		LFSpanDone		// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz		LFSpanLoop
	// (function epilogue continues beyond this excerpt)
   1103 
   1104 	jmp		LFDone
   1105 
   1106 LFNegSpan:
   1107 	fmuls	FloatMinus2ToThe31nd
   1108 	fistpl	izistep		// note: we are relying on FP exceptions being turned
   1109 						// off here to avoid range problems
   1110 	movl	izistep,%ebx	// remains loaded for all spans
   1111 
   1112 LFNegSpanLoop:
   1113 // set up the initial 1/z value
   1114 	fildl	espan_t_v(%esi)
   1115 	fildl	espan_t_u(%esi)
   1116 	movl	espan_t_v(%esi),%ecx
   1117 	movl	C(d_pzbuffer),%edi
   1118 	fmuls	C(d_zistepu)
   1119 	fxch	%st(1)
   1120 	fmuls	C(d_zistepv)
   1121 	fxch	%st(1)
   1122 	fadds	C(d_ziorigin)
   1123 	imull	C(d_zrowbytes),%ecx
   1124 	faddp	%st(0),%st(1)
   1125 
   1126 // clamp if z is nearer than 2 (1/z > 0.5)
   1127 	fcoms	float_point5
   1128 	addl	%ecx,%edi
   1129 	movl	espan_t_u(%esi),%edx
   1130 	addl	%edx,%edx				// word count
   1131 	movl	espan_t_count(%esi),%ecx
   1132 	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
   1133 	pushl	%esi		// preserve spans pointer
   1134 	fnstsw	%ax
   1135 	testb	$0x45,%ah
   1136 	jz		LClampNeg
   1137 
   1138 	fmuls	Float2ToThe31nd
   1139 	fistpl	izi			// note: we are relying on FP exceptions being turned
   1140 						// off here to avoid problems when the span is closer
   1141 						// than 1/(2**31)
   1142 	movl	izi,%edx
   1143 
   1144 // at this point:
   1145 // %ebx = izistep
   1146 // %ecx = count
   1147 // %edx = izi
   1148 // %edi = pdest
   1149 
   1150 LZDrawNeg:
   1151 
   1152 // do a single pixel up front, if necessary to dword align the destination
   1153 	testl	$2,%edi
   1154 	jz		LFNegMiddle
   1155 	movl	%edx,%eax
   1156 	subl	%ebx,%edx
   1157 	shrl	$16,%eax
   1158 	decl	%ecx
   1159 	movw	%ax,(%edi)
   1160 	addl	$2,%edi
   1161 
   1162 // do middle a pair of aligned dwords at a time
   1163 LFNegMiddle:
   1164 	pushl	%ecx
   1165 	shrl	$1,%ecx				// count / 2
   1166 	jz		LFNegLast			// no aligned dwords to do
   1167 	shrl	$1,%ecx				// (count / 2) / 2
   1168 	jnc		LFNegMiddleLoop		// even number of aligned dwords to do
   1169 
   1170 	movl	%edx,%eax
   1171 	subl	%ebx,%edx
   1172 	shrl	$16,%eax
   1173 	movl	%edx,%esi
   1174 	subl	%ebx,%edx
   1175 	andl	$0xFFFF0000,%esi
   1176 	orl		%esi,%eax
   1177 	movl	%eax,(%edi)
   1178 	addl	$4,%edi
   1179 	andl	%ecx,%ecx
   1180 	jz		LFNegLast
   1181 
   1182 LFNegMiddleLoop:
   1183 	movl	%edx,%eax
   1184 	subl	%ebx,%edx
   1185 	shrl	$16,%eax
   1186 	movl	%edx,%esi
   1187 	subl	%ebx,%edx
   1188 	andl	$0xFFFF0000,%esi
   1189 	orl		%esi,%eax
   1190 	movl	%edx,%ebp
   1191 	movl	%eax,(%edi)
   1192 	subl	%ebx,%edx
   1193 	shrl	$16,%ebp
   1194 	movl	%edx,%esi
   1195 	subl	%ebx,%edx
   1196 	andl	$0xFFFF0000,%esi
   1197 	orl		%esi,%ebp
   1198 	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
   1199 	addl	$8,%edi
   1200 
   1201 	decl	%ecx
   1202 	jnz		LFNegMiddleLoop
   1203 
   1204 LFNegLast:
   1205 	popl	%ecx			// retrieve count
   1206 	popl	%esi			// retrieve span pointer
   1207 
   1208 // do the last, unaligned pixel, if there is one
   1209 	andl	$1,%ecx			// is there an odd pixel left to do?
   1210 	jz		LFNegSpanDone	// no
   1211 	shrl	$16,%edx
   1212 	movw	%dx,(%edi)		// do the final pixel's z
   1213 
   1214 LFNegSpanDone:
   1215 	movl	espan_t_pnext(%esi),%esi
   1216 	testl	%esi,%esi
   1217 	jnz		LFNegSpanLoop
   1218 
   1219 LFDone:
   1220 	popl	%ebx				// restore register variables
   1221 	popl	%esi
   1222 	popl	%edi
   1223 	popl	%ebp				// restore the caller's stack frame
   1224 	ret
   1225 
   1226 #endif	// id386
   1227