/// tmap_scanline_per - Pentium-optimized assembly version
/// written by Brian Raiter, Mar 1998.
/// lighting roundoff error fixed by Matt Mueller, July 1999

	
/// The gist of the algorithm is as follows (note that this is
/// pseudocode, not actual C):
///
/// int  u = fx_u;
/// int  v = fx_v;
/// int  z = fx_z;
/// int  l = fx_l;
/// int  x, ubyz, vbyz;
/// byte texmap[64][64] = pixptr;
/// byte framebuffer[][bytes_per_row] = write_buffer;
/// byte lightingtable[][256] = gr_fade_table;
/// byte c;
///
/// for (x = fx_xleft ; x <= fx_xright ; ++x) {
///	ubyz = (u / z) & 63;
///	vbyz = (v / z) & 63;
///	c = texmap[vbyz][ubyz];
///	if (c != TRANSPARENT_COLOR)
///	    framebuffer[fx_y][x] = lightingtable[l / 65536][c];
///	u += fx_du_dx;
///	v += fx_dv_dx;
///	z += fx_dz_dx;
///	l += fx_dl_dx;
/// }
///
/// The global variable Transparency_on is zero when it is known that
/// there are no transparencies involved, so in that case we use a
/// different loop that skips the transparency test.
///
/// The actual algorithm used here only does the division calculations
/// every fourth pixel, and linearly interpolates the other three.
/// Something along the lines of:
///
/// /* Initial values as before */
/// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
///
/// ubyz0 = u / z;
/// vbyz0 = v / z;
/// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
///	u += fx_du_dx * 4;
///	v += fx_dv_dx * 4;
///	z += fx_dz_dx * 4;
///	ubyz4 = u / z;
///	vbyz4 = v / z;
///	du1 = (ubyz4 - ubyz0) / 4;
///	dv1 = (vbyz4 - vbyz0) / 4;
///	ubyz = ubyz0;
///	vbyz = vbyz0;
///	for (i = 0 ; i < 4 ; ++i) {
///	    c = texmap[vbyz & 63][ubyz & 63];
///	    if (c != TRANSPARENT_COLOR)
///		framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
///	    ubyz += du1;
///	    vbyz += dv1;
///	    l += fx_dl_dx;
///	}
///	ubyz0 = ubyz4;
///	vbyz0 = vbyz4;
/// }
/// for ( ; x <= fx_xright ; ++x) {
///	/* Finish off remaining 0-3 pixels */
/// }
///
/// So much for the basic overview.
///
/// In this version, the Pentium's floating-point unit is pressed into
/// service to do the actual divisions, so that 1/z can be calculated
/// first, and the resulting reciprocal multiplied with u and v. These
/// two products are then stored back out as integers. This keeps us
/// down to doing only one division every four pixels, during which
/// other integer instructions can be overlapped.
///
/// The algorithm actually divides 64 by z, so that the rounded-off
/// products will effectively be stored with six fraction bits. This
/// allows the algorithm to correct for minor floating-point roundoff
/// errors. Two fraction bits are kept during the interpolation of the
/// three middle pixels, which hopefully increases the accuracy of the
/// approximations.
///
/// We only need the lowest six (integral) bits of u/z and v/z for
/// each pixptr offset, so we only need eight bits of each fourth pair
/// of values to figure the interpolation. Together with the two
/// fractional bits we keep for extra precision, this makes ten bits for
/// each value, or twenty to store the full pair. To simplify the
/// interpolation, the pair is packed into a single 32-bit register
/// like so:
///
///		3      2       1
///		1      4       6       8       0
///		________vvVVVVVVvv____uuUUUUUUuu
///		          \v&63/        \u&63/
///
/// The unused bits between the u and v values permit the packed
/// values to be added/subtracted without the u values spilling over
/// into the v values. Then, after anding out the carry/borrow bits,
/// the instructions "movb %al, %ah ; shrl $10, %eax" nicely
/// right-justify the desired values into a pixptr offset.
/// 
/// The FP stack is loaded up with the values of u, v, and z,
/// converted to floats. %ebp is used to hold the value of l, %esi is
/// set to pixptr, and %edi points to our current position in
/// write_buffer.


// This is used to abbreviate an annoying external variable name.

.equ    fadetbl, _gr_fade_table


// DoFPCalcs - consume the result of the previous division and set up
// the operands for the next one.
//
// On entry the FP stack holds (top first): z, u, v, 64/z.
// On exit the FP stack holds (top first):  z', u', v', 64, where the
// primed values have had dzdx, dudx and dvdx added to them. The caller
// is expected to issue "fdivr %st, %st(3)" afterwards to turn the 64
// into the next 64/z.
//
// Results: round(64*u/z) is left in the low dword of ubyz4 and
// round(64*v/z) in the low dword of vbyz4.
//
// How the rounding works: the number (2^51 + 2^52) is added to each
// product before it is stored as a qword (double). Since a double's
// mantissa field is only 52 bits wide, this magic number causes the
// fractional part to be shifted off the end, leaving the integral part
// right-justified. Thus, reading the low dword gives the original
// number rounded off to the nearest integer - in two's complement, no
// less. (Per the original author, this technique allows for more
// pipelining than using the more straightforward fist/p instruction.)
//
// Side effects: reads dzdx, dudx, dvdx, magic and flt64; writes ubyz4
// and vbyz4; temporarily occupies two FP stack slots beyond the four
// it is handed (st(4) via fst, then one more via flds). Touches no
// integer registers.

.macro DoFPCalcs		// The FP stack after each instruction:
				//                 z    u    v  64/z
	fadds	(dzdx)		// z += dzdx	  z'   u    v  64/z
	fxch	%st(1)		//		  u    z'   v  64/z
	fst	%st(4)		// copy u	  u    z'   v  64/z   u
	fmul	%st(3)		// (64 / z) * u	 u/z   z'   v  64/z   u
	fxch	%st(4)		//		  u    z'   v  64/z  u/z
	fadds	(dudx)		// u += dudx	  u'   z'   v  64/z  u/z
	fxch	%st(2)		//		  v    z'   u' 64/z  u/z
	fmul	%st, %st(3)	// (64 / z) * v	  v    z'   u'  v/z  u/z
	fxch	%st(4)		//		 u/z   z'   u'  v/z   v
	fadds	(magic)		// round u/z	 U/Z   z'   u'  v/z   v
	fxch	%st(4)		//		  v    z'   u'  v/z  U/Z
	fadds	(dvdx)		// v += dvdx	  v'   z'   u'  v/z  U/Z
	fxch	%st(3)		//		 v/z   z'   u'   v'  U/Z
	fadds	(magic)		// round v/z	 V/Z   z'   u'   v'  U/Z
	flds	(flt64)		// next dividend  64  V/Z   z'   u'   v'  U/Z
	fxch	%st(5)		//		 U/Z  V/Z   z'   u'   v'   64
	fstpl	(ubyz4)		// store + pop	 V/Z   z'   u'   v'   64
	fstpl	(vbyz4)		// store + pop	  z'   u'   v'   64
				// (ready to start the next division)
.endm

// The code below always refers to the C globals by their
// underscore-prefixed names. When __ENV_LINUX__ is defined, each of
// those names is aliased to the same name without the underscore, and
// the entry point is likewise exported without one. (Presumably this is
// because C symbols carry no leading underscore on that platform -
// confirm against the build configuration.)

#ifdef __ENV_LINUX__
.equ _pixptr, pixptr
.equ _gr_fade_table, gr_fade_table
.equ _write_buffer, write_buffer
.equ _bytes_per_row,bytes_per_row
.equ _fx_xleft, fx_xleft
.equ _fx_xright, fx_xright
.equ _fx_y, fx_y
.equ _fx_u, fx_u
.equ _fx_v, fx_v
.equ _fx_z, fx_z
.equ _fx_l, fx_l
.equ _fx_du_dx, fx_du_dx
.equ _fx_dv_dx, fx_dv_dx
.equ _fx_dz_dx, fx_dz_dx
.equ _fx_dl_dx, fx_dl_dx
.equ _Transparency_on, Transparency_on

// Export the entry point under the name matching the platform.
.globl asm_tmap_scanline_per
#else
.globl _asm_tmap_scanline_per
#endif

// Everything this routine reads from the rest of the program: the
// texture and frame-buffer pointers, the scanline geometry, the
// starting u/v/z/l values with their per-pixel deltas, and the
// transparency flag.
.extern _pixptr, _gr_fade_table, _write_buffer
.extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
.extern _fx_u, _fx_v, _fx_z, _fx_l
.extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
.extern _Transparency_on


//.local  dudx, dvdx, dzdx, dldx
//.local  ubyz4, vbyz4, uvzero
//.local  lastquartet, lastpixel, ctlwd
//.local  flt64, magic


// Working storage for the scanline routine. The routine keeps its
// state in these fixed locations rather than on the stack.
//
// The order matters: the section is aligned to 8 bytes and exactly
// four longs (16 bytes) precede ubyz4, so both doubles land on 8-byte
// boundaries. Keep that in mind before inserting or reordering entries.
.data

.balign 8

lastquartet:	.long	0		// frame-buffer address where the 4-pixel loops stop
lastpixel:	.long	0		// frame-buffer address one past the last pixel to draw
flt64:		.long	0x42800000	// 64.0 as a single-precision float (what we divide z into)
magic:		.long	0x59C00000	// 2^51 + 2^52 as a float (to get ints from floats)
ubyz4:		.double	0.0		// 64*u/z + magic; the low dword is the rounded integer
vbyz4:		.double	0.0		// 64*v/z + magic; the low dword is the rounded integer
dudx:		.long	0		// 4 * fx_du_dx, stored as a float
dvdx:		.long	0		// 4 * fx_dv_dx, stored as a float
dzdx:		.long	0		// 4 * fx_dz_dx, stored as a float
dldx:		.long	0		// 4 * fx_dl_dx, stored as an integer
uvzero:		.long	0		// packed u/z and v/z for the first pixel of the next group
ctlwd:		.word	0		// the caller's FPU control word, reloaded on exit


.text

.balign 4

//
// void asm_tmap_scanline_per(void)
//
// Draws one perspective-corrected, lit, texture-mapped scanline.
//
// In:    no arguments. Everything is read from globals: pixptr,
//        write_buffer, bytes_per_row, fx_y, fx_xleft, fx_xright,
//        fx_u, fx_v, fx_z, fx_l, fx_du_dx, fx_dv_dx, fx_dz_dx,
//        fx_dl_dx, Transparency_on and gr_fade_table.
// Out:   nothing returned; pixels fx_xleft..fx_xright of row fx_y in
//        write_buffer are written. Nothing is drawn when
//        fx_xright < fx_xleft.
// Saved: %ebx, %esi, %edi, %ebp (pushed on entry, popped on exit) and
//        the FPU control word.
// Clobb: %eax, %ecx, %edx, flags, and the local working storage.
// FPU:   four values are pushed and four are popped, so the stack
//        depth is unchanged on return; DoFPCalcs briefly uses two
//        further slots, so six free registers are needed.
//
// Register roles once set-up is complete:
//   %esi  pixptr (base of the 64x64 texture)
//   %edi  current position in the frame buffer
//   %ebp  current lighting value l (16.16); reused as the end pointer
//         in the final 1-3 pixel loop
//   %eax, %ebx, %ecx, %edx  scratch
//

#ifdef __ENV_LINUX__
asm_tmap_scanline_per:
#else
_asm_tmap_scanline_per:
#endif

// Save every callee-saved register we are going to use. %ebx is
// included: it is used as scratch all the way through, and the calling
// convention requires it to survive the call just like %ebp, %edi and
// %esi.

		pushl	%ebp
		pushl	%edi
		pushl	%esi
		pushl	%ebx

// Tell the FPU to use 64-bit (double-precision) numbers - still plenty
// precise enough for our needs - so as to speed up fdiv. This sets the
// precision-control field (bits 8-9 of the control word) to binary 10.
// The caller's control word is written back to ctlwd afterwards so
// that it can be reloaded on the way out.

		fnstcw	(ctlwd)
		movw	(ctlwd), %ax
		movl	%eax, %ebx
		andb	$0xFC, %bh
		orb	$0x02, %bh
		movw	%bx, (ctlwd)
		fldcw	(ctlwd)
		movw	%ax, (ctlwd)

// Multiply dudx, dvdx, and dzdx by four, and store locally, converted
// into floating point. (The integers are parked in the destination
// slots first so that fildl can load them from memory.)

		movl	(_fx_du_dx), %ebx
		movl	(_fx_dv_dx), %ecx
		sall	$2, %ebx
		movl	(_fx_dz_dx), %edx
		sall	$2, %ecx
		movl	%ebx, (dudx)
		sall	$2, %edx
		movl	%ecx, (dvdx)
		movl	%edx, (dzdx)
		fildl	(dudx)
		fildl	(dvdx)
		fildl	(dzdx)
		fxch	%st(2)			// bring du to the top
		fstps	(dudx)
		fstps	(dvdx)
		fstps	(dzdx)

// bytes_per_row * fx_y is the offset for the current scanline. (We do
// this now before we start the first FP division.) The product's low
// dword stays in %eax until it is added to the frame-buffer pointer.

		movl	(_bytes_per_row), %eax
		xorl	%edx, %edx
		mull	(_fx_y)

// Push 64.0, v, u, and z onto the FPU stack, and then start
// calculating the first 64 / z.

		flds	(flt64)
		fildl	(_fx_v)
		fildl	(_fx_u)
		fildl	(_fx_z)
		fdivr	%st, %st(3)

// Meanwhile, load l into %ebp, where it lives for the duration, and
// store dldx (again multiplied by four) in memory. l is kept at full
// precision here; it is only reduced to a fade-table row index right
// before each table access, which avoids the accumulated rounding
// error that pre-dividing it used to cause. -MM

		movl	(_fx_l), %ebp
		movl	(_fx_dl_dx), %edx
		sall	$2, %edx
		movl	%edx, (dldx)

// Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
// write_buffer, the pointer to our frame buffer, in %edi. Then offset
// %edi so that it points to pixel (fx_y)[fx_xleft]. Calculate a
// pointer to (fx_y)[fx_xright + 1] so we know when to stop drawing
// (lastpixel). Also calculate lastquartet = lastpixel - (count & 3),
// where count is the number of pixels to draw, so we know when to stop
// drawing four pixels at a time. If count is zero or negative there is
// nothing to do.

		movl	(_pixptr), %esi
		movl	(_write_buffer), %edi
		movl	(_fx_xright), %ecx
		addl	%eax, %edi		// %edi = start of row fx_y
		incl	%ecx
		addl	%edi, %ecx		// %ecx = &row[fx_xright + 1]
		addl	(_fx_xleft), %edi	// %edi = &row[fx_xleft]
		movl	%ecx, %eax
		subl	%edi, %eax		// %eax = pixel count (signed)
		jle	LeaveNow
		andl	$3, %eax
		movl	%ecx, (lastpixel)
		subl	%eax, %ecx
		movl	%ecx, (lastquartet)

// Calculate round(64 * u / z) and round(64 * v / z), store, and
// increment u, v, and z. Then start calculating the second 64 / z.

		DoFPCalcs
		fdivr	%st, %st(3)

// Get our u/z and v/z values, lop off the bits we don't care about,
// pack, and store in uvzero.

		movl	(ubyz4), %eax
		movl	(vbyz4), %ebx
		incl	%eax
		incl	%ebx
		andl	$0x3FF0, %eax
		andl	$0x3FF0, %ebx
		shrl	$4, %eax
		shll	$10, %ebx
		orl	%eax, %ebx
		movl	%ebx, (uvzero)

// While we're waiting for the last division to finish, we might as
// well get the frame buffer into the cache. (Only the memory read
// matters; the comparison result is ignored.)

		cmpb	(%edi), %al

// Are there at least four pixels to draw? If not, skip to the epilog
// code. (%ecx still holds lastquartet.)

		cmpl	%ecx, %edi
		je	LastBits

// Do we need to test for transparencies?

		testl	$(~0), (_Transparency_on)
		jnz	LoopTransOn

// If not, then use the simpler loop here.


.balign 4

LoopTransOff:

// While the FPU is busy dividing, the latest u/z and v/z values are
// retrieved, packed, and stored in uvzero (to be used again in the
// next iteration). The old uvzero value, which contains the uv values
// for pixel 0, gets subtracted from the new uvzero value to determine
// the total change in u/z and v/z across the four pixels, and this is
// divided by 4 to get the average. (Bit 12 of the new value is set
// before subtracting so that a borrow out of the u field is absorbed
// in the gap instead of reaching the v field.) This average is then
// used to estimate the values for pixels 1, 2, and 3. The old uvzero
// value is used immediately to calculate pixel 0, while %eax, %ecx, and
// %ebx are entrusted with the uv values for pixels 1, 2, and 3
// respectively. %edx is set from the current value of l, such that %dh
// already holds the row half of the offset into fadetbl; the same
// lighting row is used for all four pixels. Each uv value is used to
// set its pixel as follows (assuming our packed uv value is in %ebx):
//
//	a:	andl	$0x003F00FC, %ebx	- mask off extraneous bits
//	b:	movb	%bl, %bh		- make u flush with v
//	c:	shrl	$10, %ebx		- right-justify u and v
//	d:	movb	(%esi,%ebx), %dl	- get texture-map pixel
//	e:	movb	fadetbl(%edx), %bl	- correct for lighting level
//	f:	movb	%bl, (%edi)		- write pixel to frame buffer
//
// The above is done four times, once for each pixel. All of the
// calculations are interleaved in order to avoid AGI stalls and
// missed pairing opportunities. The trailing "n x" tags name the pixel
// number and the step from the list above.

		DoFPCalcs
		fdivr	%st, %st(3)
		movl	(ubyz4), %ebx
		movl	(vbyz4), %edx
		incl	%ebx
		incl	%edx
		shrl	$4, %ebx
		andl	$0x3FF0, %edx
		shll	$10, %edx
		andl	$0x03FF, %ebx
		movl	(uvzero), %ecx		// %ecx = uv value for pixel 0
		orl	%edx, %ebx
		movl	%ecx, %eax
		movl	%ebx, (uvzero)
		andl	$0x003F00FC, %ecx	// 0 a
		orl	$0x1000, %ebx		// guard bit against u borrow
		movb	%cl, %ch		// 0 b
		subl	%eax, %ebx		// %ebx = change over 4 pixels
		shrl	$10, %ecx		// 0 c
		movl	$0x7F0000, %edx
		shrl	$2, %ebx		// %ebx = change per pixel
		andl	%ebp, %edx
		sarl	$8, %edx		// %dh = fade-table row
		movb	(%esi,%ecx), %dl	// 0 d
		addl	$4, %edi
		lea	(%eax,%ebx,2), %ecx	// %ecx = uv value for pixel 2
		addl	%ebx, %eax		// %eax = uv value for pixel 1
		addl	%ecx, %ebx		// %ebx = uv value for pixel 3
		andl	$0x003F00FC, %ecx	// 2 a
		movb	%cl, %ch		// 2 b
		movb	fadetbl(%edx), %dl	// 0 e
		shrl	$10, %ecx		// 2 c
		andl	$0x003F00FC, %eax	// 1 a
		movb	%dl, -4(%edi)		// 0 f
		movb	%al, %ah		// 1 b
		movb	(%esi,%ecx), %dl	// 2 d
		andl	$0x003F00FC, %ebx	// 3 a
		shrl	$10, %eax		// 1 c
		movb	%bl, %bh		// 3 b
		movb	fadetbl(%edx), %cl	// 2 e
		movb	(%esi,%eax), %dl	// 1 d
		shrl	$10, %ebx		// 3 c
		movb	%cl, -2(%edi)		// 2 f
		movl	(dldx), %ecx
		movb	fadetbl(%edx), %al	// 1 e
		movb	(%esi,%ebx), %dl	// 3 d
		movb	%al, -3(%edi)		// 1 f
		addl	%ecx, %ebp		// l += 4 * dl/dx
		movb	fadetbl(%edx), %bl	// 3 e
		movl	(lastquartet), %ecx
		movb	%bl, -1(%edi)		// 3 f
		cmpl	%ecx, %edi
		jb	LoopTransOff		// addresses: unsigned compare

// Are there any pixels left at all?

		cmpl	(lastpixel), %edi
		jnz	LastBits
		jmp	LeaveNow


.balign 4

LoopTransOn:

// This is similar to the LoopTransOff loop, the big change being that
// each value retrieved from the texture map is tested against 255,
// the transparent "color". A value of 255 in the texture map means to
// let the existing value for that pixel in write_buffer go by
// unchanged. Thus the code for each pixel looks something like this
// instead:
//
//	a:	andl	$0x003F00FC, %ebx	- mask off extraneous bits
//	b:	movb	%bl, %bh		- make u flush with v
//	c:	shrl	$10, %ebx		- right-justify u and v
//	d:	movb	(%esi,%ebx), %dl	- get texture-map pixel
//	e:	cmpb	$255, %dl		- is pixel transparent?
//	f:	sbbb	%bh, %bh		- yes, %bh=00; no, %bh=FF
//	g:	movb	fadetbl(%edx), %dl	- get lighting-corrected pixel
//	h:	movb	(%edi), %bl		- get pixel in frame buffer now
//	i:	xorb	%bl, %dl		- combine the two
//	j:	andb	%dl, %bh		- use %bh as a mask to select
//	k:	xorb	%bl, %bh		-     which pixel to keep
//	l:	movb	%bh, (%edi)		- write pixel to frame buffer
//
// When the texture-map value is 255, the code simply writes the
// original frame-buffer value back out again; otherwise the new pixel
// is written instead. The ands and xors used to accomplish this bulk
// up the code, but on the whole it is better than having four
// unpredictable jumps in the loop. The four repeats of the above code
// are even more intertwined than the other loop, due to the extra
// register usage. Also note that the last two pixels combine steps i,
// j, and k with each other: pixel 2 lives in the second byte and
// pixel 3 in the low byte of %eax (mask), %ebx (old pixel) and %ecx
// (new pixel), so one 32-bit xor/and/xor handles both.

		DoFPCalcs
		fdivr	%st, %st(3)
		movl	(ubyz4), %ebx
		movl	(vbyz4), %edx
		incl	%ebx
		incl	%edx
		movl	(uvzero), %ecx		// %ecx = uv for pixel 0
		andl	$0x3FF0, %ebx
		shrl	$4, %ebx
		andl	$0x3FF0, %edx
		shll	$10, %edx
		movl	%ecx, %eax
		andl	$0x003F00FC, %ecx	// 0 a
		orl	%edx, %ebx
		movb	%cl, %ch		// 0 b
		addl	$4, %edi
		shrl	$10, %ecx		// 0 c
		movl	$0x7F0000, %edx
		movl	%ebx, (uvzero)
		andl	%ebp, %edx
		sarl	$8, %edx		// %dh = fade-table row
		movb	(%esi,%ecx), %dl	// 0 d
		orl	$0x1000, %ebx		// guard bit against u borrow
		subl	%eax, %ebx		// %ebx = change over 4 pixels
		movb	-4(%edi), %ch		// 0 h
		movb	fadetbl(%edx), %cl	// 0 g
		cmpb	$255, %dl		// 0 e
		sbbb	%dl, %dl		// 0 f
		xorb	%ch, %cl		// 0 i
		shrl	$2, %ebx		// %ebx = change per pixel
		andb	%cl, %dl		// 0 j
		xorb	%ch, %dl		// 0 k
//		nop				// (V-pipe idle)
		lea	(%eax,%ebx,2), %ecx	// %ecx = uv for pixel 2
		addl	%ebx, %eax		// %eax = uv for pixel 1
		andl	$0x003F00FC, %eax	// 1 a
		addl	%ecx, %ebx		// %ebx = uv for pixel 3
		movb	%al, %ah		// 1 b
		andl	$0x003F00FC, %ecx	// 2 a
		shrl	$10, %eax		// 1 c
		andl	$0x003F00FC, %ebx	// 3 a
		movb	%cl, %ch		// 2 b
		movb	%bl, %bh		// 3 b
		movb	%dl, -4(%edi)		// 0 l
		movb	(%esi,%eax), %dl	// 1 d
		movb	-3(%edi), %al		// 1 h
		cmpb	$255, %dl		// 1 e
		sbbb	%ah, %ah		// 1 f
		movb	fadetbl(%edx), %dl	// 1 g
		shrl	$10, %ecx		// 2 c
		xorb	%al, %dl		// 1 i
		shrl	$10, %ebx		// 3 c
		andb	%dl, %ah		// 1 j
		xorb	%al, %ah		// 1 k
		movb	(%esi,%ecx), %dl	// 2 d
		movb	%ah, -3(%edi)		// 1 l
		cmpb	$255, %dl		// 2 e
		sbbb	%ah, %ah		// 2 f
		movb	fadetbl(%edx), %ch	// 2 g
		movb	(%esi,%ebx), %dl	// 3 d
		movb	-2(%edi), %bh		// 2 h
		cmpb	$255, %dl		// 3 e
		movb	-1(%edi), %bl		// 3 h
		sbbb	%al, %al		// 3 f
		movb	fadetbl(%edx), %cl	// 3 g
		movl	(dldx), %edx
		xorl	%ebx, %ecx		// 2 i and 3 i
		addl	%edx, %ebp		// l += 4 * dl/dx
		andl	%ecx, %eax		// 2 j and 3 j
		movl	(lastquartet), %ecx
		xorl	%ebx, %eax		// 2 k and 3 k
		movb	%ah, -2(%edi)		// 2 l
		cmpl	%ecx, %edi
		movb	%al, -1(%edi)		// 3 l (mov leaves the flags alone)
		jb	LoopTransOn		// addresses: unsigned compare

// Quit if there are none at all left.

		cmpl	(lastpixel), %edi
		jz	LeaveNow


LastBits:

// Here we finish off the last one-to-three pixels assigned to us.
// Rather than calculating values for all four pixels, we just divide
// the difference by four and keep adding this average into the value
// as needed. (This code is not particularly optimized, by the way,
// since it represents such a minuscule amount of the running time.)
//
// No new division is started after this DoFPCalcs, since no further
// u/z and v/z values are needed. The per-pixel step in %ecx is masked
// down to the two 8-bit deltas so that repeated additions cannot
// disturb the other field. %ebp is no longer needed for l, so it is
// reused to hold lastpixel; %edx keeps the fade-table row in %dh.
// Note that this loop always performs the transparency test,
// whatever the value of Transparency_on.

		DoFPCalcs
		movl	(ubyz4), %ecx
		movl	(vbyz4), %edx
		incl	%ecx
		incl	%edx
		shrl	$4, %ecx
		andl	$0x3FF0, %edx
		shll	$10, %edx
		andl	$0x03FF, %ecx
		movl	(uvzero), %ebx		// %ebx = uv value for next pixel
		orl	%edx, %ecx
		orl	$0x1000, %ecx		// guard bit against u borrow
		subl	%ebx, %ecx
		shrl	$2, %ecx		// %ecx = change per pixel
		andl	$0x003FC0FF, %ecx
		movl	%ebp, %edx
		movl	(lastpixel), %ebp
		andl	$0x7F0000, %edx
		sarl	$8, %edx		// %dh = fade-table row

LoopLastBits:	movl	%ebx, %eax
		movb	%al, %ah		// make u flush with v
		shrl	$10, %eax		// right-justify u and v
		andb	$0x0F, %ah		// keep only the 12 offset bits
		movb	(%esi,%eax), %dl	// get texture-map pixel
		cmpb	$255, %dl		// transparent? then skip the write
		jz	LetPixelBy
		movb	fadetbl(%edx), %al
		movb	%al, (%edi)
LetPixelBy:	addl	%ecx, %ebx
		incl	%edi
		cmpl	%ebp, %edi
		jb	LoopLastBits		// addresses: unsigned compare


LeaveNow:

// We're done! Restore the saved registers (in reverse order of the
// pushes), pop the four values still sitting on the FP stack, reload
// the caller's FPU control word, and we are so out of here.

		popl	%ebx
		popl	%esi
		popl	%edi
		popl	%ebp
		fcompp
		fcompp
		fldcw	(ctlwd)
		ret