1 /// $Id: tmappent.S,v 1.4 2003-02-18 20:15:48 btb Exp $
2 /// tmap_scanline_per - Pentium-optimized assembly version
3 /// written by Brian Raiter, Mar 1998.
4 /// lighting roundoff error fixed by Matt Mueller, July 1999
7 /// The gist of the algorithm is as follows (note that this is
8 /// pseudocode, not actual C):
14 /// int x, ubyz, vbyz;
15 /// byte texmap[64][64] = pixptr;
16 /// byte framebuffer[][bytes_per_row] = write_buffer;
17 /// byte lightingtable[][256] = gr_fade_table;
20 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
21 /// ubyz = (u / z) & 63;
22 /// vbyz = (v / z) & 63;
23 /// c = texmap[ubyz][vbyz];
24 /// if (c != TRANSPARENT_COLOR)
25 /// framebuffer[fx_y][x] = lightingtable[l / 65536][c];
32 /// The global variable Transparency_on is zero when it is known that
33 /// there are no transparencies involved, so in that case we use a
34 /// different loop that skips the transparency test.
36 /// The actual algorithm used here only does the division calculations
37 /// every fourth pixel, and linearly interpolates the other three.
38 /// Something along the lines of:
40 /// /* Initial values as before */
41 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
45 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
46 /// u += fx_du_dx * 4;
47 /// v += fx_dv_dx * 4;
48 /// z += fx_dz_dx * 4;
51 /// du1 = (ubyz4 - ubyz0) / 4;
52 /// dv1 = (vbyz4 - vbyz0) / 4;
55 /// for (i = 0 ; i < 4 ; ++i) {
56 /// c = texmap[ubyz & 63][vbyz & 63];
57 /// if (c != TRANSPARENT_COLOR)
58 /// framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
66 /// for ( ; x <= fx_xright ; ++x) {
67 /// /* Finish off remaining 0-3 pixels */
70 /// So much for the basic overview.
72 /// In this version, the Pentium's floating-point unit is pressed into
73 /// service to do the actual divisions, so that 1/z can be calculated
74 /// first, and the resulting reciprocal multiplied with u and v. These
75 /// two products are then stored back out as integers. This keeps us
76 /// down to doing only one division every four pixels, during which
77 /// other integer instructions can be overlapped.
79 /// The algorithm actually divides 64 by z, so that the rounded-off
80 /// products will effectively be stored with six fraction bits. This
81 /// allows the algorithm to correct for minor floating-point roundoff
82 /// errors. Two fraction bits are kept during the interpolation of the
83 /// three middle pixels, which hopefully increases the accuracy of the
86 /// We only need the lowest six (integral) bits of u/z and v/z for
87 /// each pixptr offset, so we only need eight bits of each fourth pair
88 /// of values to figure the interpolation. Along with the two fractional
89 /// bits we keep for extra precision flavor, this makes ten bits for
90 /// each value, or twenty to store the full pair. To simplify the
91 /// interpolation, the pair is packed into a single 32-bit register
96 /// ________vvVVVVVVvv____uuUUUUUUuu
99 /// The unused bits between the u and v values permit the packed
100 /// values to be added/subtracted without the u values spilling over
101 /// into the v values. Then, after anding out the carry/borrow bits,
102 /// the instructions "movb %al, %ah ; shrl $10, %eax" nicely
103 /// right-justify the desired values into a pixptr offset.
105 /// The FP stack is loaded up with the values of u, v, and z,
106 /// converted to floats. %ebp is used to hold the value of l, %esi is
107 /// set to pixptr, and %edi points to our current position in
112 // This is used to abbreviate an annoying external variable name.
114 .equ fadetbl, _gr_fade_table // used as fadetbl(%edx): %dh selects the lighting row, %dl holds the texel
117 // The following macro encapsulates the floating-point instructions
118 // that put the results of a prior division to use and prepare for the
119 // next division. At the beginning of the macro, the FP stack contains
120 // (from top to bottom): z, u, v, 64/z. The macro computes (64*u)/z,
121 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
122 // The number (2^51 + 2^52) is added to each number before they are
123 // stored as qwords. Since qwords only have 52 bits of precision, this
124 // magic number causes the fractional part to be shifted off the end,
125 // leaving the integral part right-shifted. Thus, reading the low
126 // dword gives the original number rounded off to the nearest integer
127 // - in two's complement, no less. (This technique allows for more
128 // pipelining than using the more straightforward fist/p
129 // instruction.) Simultaneous with this, the macro adds dudx to u,
130 // dvdx to v, and dzdx to z, and finally puts 64 back onto the stack.
131 // At the end of the macro, the stack contains: z, u, v, 64.
133 .macro DoFPCalcs // action | FP stack after each instruction, st(0) first; U/Z, V/Z = u/z, v/z with magic added
135 fadds (dzdx) // z += dzdx | z' u v 64/z
136 fxch %st(1) // | u z' v 64/z
137 fst %st(4) // keep a copy of u in st(4) | u z' v 64/z u
138 fmul %st(3) // (64 / z) * u | u/z z' v 64/z u
139 fxch %st(4) // | u z' v 64/z u/z
140 fadds (dudx) // u += dudx | u' z' v 64/z u/z
141 fxch %st(2) // | v z' u' 64/z u/z
142 fmul %st, %st(3) // (64 / z) * v, result left in st(3) | v z' u' v/z u/z
143 fxch %st(4) // | u/z z' u' v/z v
144 fadds (magic) // round u/z via the magic constant | U/Z z' u' v/z v
145 fxch %st(4) // | v z' u' v/z U/Z
146 fadds (dvdx) // v += dvdx | v' z' u' v/z U/Z
147 fxch %st(3) // | v/z z' u' v' U/Z
148 fadds (magic) // round v/z via the magic constant | V/Z z' u' v' U/Z
149 flds (flt64) // push 64.0 for the next division | 64 V/Z z' u' v' U/Z
150 fxch %st(5) // | U/Z V/Z z' u' v' 64
151 fstpl (ubyz4) // store U/Z as a qword and pop | V/Z z' u' v' 64
152 fstpl (vbyz4) // store V/Z as a qword and pop | z' u' v' 64
153 // (ready to start the next division)
158 .equ _gr_fade_table, gr_fade_table // each .equ makes the underscore-prefixed name an alias of the plain symbol
159 .equ _write_buffer, write_buffer // frame-buffer pointer, loaded into %edi
160 .equ _bytes_per_row,bytes_per_row // loaded into %eax for the scanline offset
161 .equ _fx_xleft, fx_xleft // added to %edi to reach the first pixel
162 .equ _fx_xright, fx_xright // loaded into %ecx to compute the stop pointers
168 .equ _fx_du_dx, fx_du_dx // loaded into %ebx
169 .equ _fx_dv_dx, fx_dv_dx // loaded into %ecx
170 .equ _fx_dz_dx, fx_dz_dx // loaded into %edx
171 .equ _fx_dl_dx, fx_dl_dx // loaded into %edx
172 .equ _Transparency_on, Transparency_on // tested to choose between the two pixel loops
174 .globl asm_pent_tmap_scanline_per // entry point exported under the plain name ...
176 .globl _asm_pent_tmap_scanline_per // ... and under the underscore-prefixed name
179 .extern _pixptr, _gr_fade_table, _write_buffer // symbols defined outside this file
180 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
181 .extern _fx_u, _fx_v, _fx_z, _fx_l
182 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
183 .extern _Transparency_on
188 //.local dudx, dvdx, dzdx, dldx
189 //.local ubyz4, vbyz4, uvzero
190 //.local lastquartet, lastpixel, ctlwd
191 //.local flt64, magic
198 lastquartet: .long 0 // frame-buffer pointer at which the 4-pixels-at-a-time loop stops
199 lastpixel: .long 0 // frame-buffer pointer at which drawing stops entirely (compared against %edi)
200 flt64: .long 0x42800000 // 64.0 as a single-precision float (what we divide z into); loaded with flds
201 magic: .long 0x59C00000 // 2^51 + 2^52 as a single-precision float (to get ints from floats); added with fadds
202 ubyz4: .double 0.0 // u/z for the next iteration; qword written by fstpl in DoFPCalcs
203 vbyz4: .double 0.0 // v/z for the next iteration; qword written by fstpl in DoFPCalcs
204 dudx: .long 0 // u's rate of change (times four) as a single-precision float
205 dvdx: .long 0 // v's rate of change (times four) as a single-precision float
206 dzdx: .long 0 // z's rate of change (times four) as a single-precision float
207 dldx: .long 0 // l's rate of change (times four) as an integer
208 uvzero: .long 0 // packed u/z and v/z values, laid out as ________vvVVVVVVvv____uuUUUUUUuu
209 ctlwd: .word 0 // the pre-tweaked FPU control word
217 // void c_tmap_scanline_per(void)
221 asm_pent_tmap_scanline_per:
223 _asm_pent_tmap_scanline_per:
226 // Save registers the compiler might be using.
232 // Tell the FPU to use 64-bit numbers (still plenty precise enough for
233 // our needs) so as to speed up fdiv.
244 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
245 // into floating point.
247 movl (_fx_du_dx), %ebx
248 movl (_fx_dv_dx), %ecx
250 movl (_fx_dz_dx), %edx
264 // bytes_per_row * fx_y is the offset for the current scanline. (We do
265 // this now before we start the first FP division.)
267 movl (_bytes_per_row), %eax
271 // Push 64.0, v, u, and z onto the FPU stack, and then start
272 // calculating the first 64 / z.
280 // Meanwhile, get l and dldx (again, the latter multiplied by four)
281 // into %edx and %ebp, where they will be stored for the duration. The
282 // original values are divided by 256 so that the byte needed for the
283 // fade table offset is squarely in %dh.
285 //Dividing by 256 is bad.. rounding errors and crap. We'll now do that
286 //right before we need to access the table instead. -MM
290 movl (_fx_dl_dx), %edx
295 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
296 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
297 // %edi so that it points to pixel (fx_y)[fx_xleft]. Calculate a
298 // pointer to (fx_y)[fx_xright + 1] so we know when to stop drawing.
299 // Also calculate a pointer to (fx_y)[(fx_xright + 1) & ~3] so we know
300 // when to stop drawing four pixels at a time.
303 movl (_write_buffer), %edi
304 movl (_fx_xright), %ecx
308 addl (_fx_xleft), %edi
313 movl %ecx, (lastpixel)
315 movl %ecx, (lastquartet)
317 // Calculate round(64 * u / z) and round(64 * v / z), store, and
318 // increment u, v, and z. Then start calculating the second 64 / z.
323 // Get our u/z and v/z values, lop off the bits we don't care about,
324 // pack, and store in uvzero.
337 // While we're waiting for the last division to finish, we might as
338 // well get the frame buffer into the cache.
342 // Are there at least four pixels to draw? If not, skip to the epilog
348 // Do we need to test for transparencies?
350 testl $(~0), (_Transparency_on)
353 // If not, then use the simpler loop here.
360 // While the FPU is busy dividing, the latest u/z and v/z values are
361 // retrieved, packed, and stored in uvzero (to be used again in the
362 // next iteration). The old uvzero value, which contains the uv values
363 // for pixel 0, gets subtracted from the new uvzero value to
364 // determine the total change in u/z and v/z across the four pixels,
365 // and this is divided by 4 to get the average. This average is then
366 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
367 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
368 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
369 // respectively. %edx is set to the current value of l, such that %dh is
370 // already set as half of the offset into fadetbl. Each uv value is
371 // used to set its pixel as follows (assuming our packed uv value is
374 // a: andl $0x003F00FC, %ebx / mask off extraneous bits
375 // b: movb %bl, %bh / make u flush with v
376 // c: shrl $10, %ebx / right-justify u and v
377 // d: movb (%esi,%ebx), %dl / get texture-map pixel
378 // e: movb fadetbl(%edx), %bl / correct for lighting level
379 // f: movb %bl, (%edi) / write pixel to frame buffer
381 // The above is done four times, once for each pixel. All of the
382 // calculations are interleaved in order to avoid AGI stalls and
383 // missed pairing opportunities.
395 movl (uvzero), %ecx // %ecx = uv value for pixel 0
399 andl $0x003F00FC, %ecx // 0 a
403 shrl $10, %ecx // 0 c
408 movb (%esi,%ecx), %dl // 0 d
410 lea (%eax,%ebx,2), %ecx // %ecx = uv value for pixel 2
411 addl %ebx, %eax // %eax = uv value for pixel 1
412 addl %ecx, %ebx // %ebx = uv value for pixel 3
413 andl $0x003F00FC, %ecx // 2 a
415 movb fadetbl(%edx), %dl // 0 e
416 shrl $10, %ecx // 2 c
417 andl $0x003F00FC, %eax // 1 a
418 movb %dl, -4(%edi) // 0 f
420 movb (%esi,%ecx), %dl // 2 d
421 andl $0x003F00FC, %ebx // 3 a
422 shrl $10, %eax // 1 c
424 movb fadetbl(%edx), %cl // 2 e
425 movb (%esi,%eax), %dl // 1 d
426 shrl $10, %ebx // 3 c
427 movb %cl, -2(%edi) // 2 f
429 movb fadetbl(%edx), %al // 1 e
430 movb (%esi,%ebx), %dl // 3 d
431 movb %al, -3(%edi) // 1 f
433 movb fadetbl(%edx), %bl // 3 e
434 movl (lastquartet), %ecx
435 movb %bl, -1(%edi) // 3 f
439 // Are there any pixels left at all?
441 cmpl (lastpixel), %edi
450 // This is similar to the LoopTransOff loop, the big change being that
451 // each value retrieved from the texture map is tested against 255,
452 // the transparent "color". A value of 255 in the texture map means to
453 // let the existing value for that pixel in write_buffer go by
454 // unchanged. Thus the code for each pixel looks something like this
457 // a: andl $0x003F00FC, %ebx / mask off extraneous bits
458 // b: movb %bl, %bh / make u flush with v
459 // c: shrl $10, %ebx / right-justify u and v
460 // d: movb (%esi,%ebx), %dl / get texture-map pixel
461 // e: cmpb $255, %dl / is pixel transparent?
462 // f: sbbb %bh, %bh / yes, %bh=00; no, %bh=FF
463 // g: movb fadetbl(%edx), %dl / get lighting-corrected pixel
464 // h: movb (%edi), %bl / get pixel in frame buffer now
465 // i: xorb %bl, %dl / combine the two
466 // j: andb %dl, %bh / use %bh as a mask to select
467 // k: xorb %bl, %bh / which pixel to keep
468 // l: movb %bh, (%edi) / write pixel to frame buffer
470 // When the texture-map value is 255, the code simply writes the
471 // original frame-buffer value back out again; otherwise the new pixel
472 // is written instead. The ands and xors used to accomplish this bulk
473 // up the code, but on the whole it is better than having four
474 // unpredictable jumps in the loop. The four repeats of the above code
475 // are even more intertwined than the other loop, due to the extra
476 // register usage. Also note that the last two pixels combine steps i,
477 // j, and k with each other.
485 movl (uvzero), %ecx // %ecx = uv for pixel 0
491 andl $0x003F00FC, %ecx // 0 a
495 shrl $10, %ecx // 0 c
500 movb (%esi,%ecx), %dl // 0 d
503 movb -4(%edi), %ch // 0 h
504 movb fadetbl(%edx), %cl // 0 g
505 cmpb $255, %dl // 0 e
511 / nop // (V-pipe idle)
512 lea (%eax,%ebx,2), %ecx // %ecx = uv for pixel 2
513 addl %ebx, %eax // %eax = uv for pixel 1
514 andl $0x003F00FC, %eax // 1 a
515 addl %ecx, %ebx // %ebx = uv for pixel 3
517 andl $0x003F00FC, %ecx // 2 a
518 shrl $10, %eax // 1 c
519 andl $0x003F00FC, %ebx // 3 a
522 movb %dl, -4(%edi) // 0 l
523 movb (%esi,%eax), %dl // 1 d
524 movb -3(%edi), %al // 1 h
525 cmpb $255, %dl // 1 e
527 movb fadetbl(%edx), %dl // 1 g
528 shrl $10, %ecx // 2 c
530 shrl $10, %ebx // 3 c
533 movb (%esi,%ecx), %dl // 2 d
534 movb %ah, -3(%edi) // 1 l
535 cmpb $255, %dl // 2 e
537 movb fadetbl(%edx), %ch // 2 g
538 movb (%esi,%ebx), %dl // 3 d
539 movb -2(%edi), %bh // 2 h
540 cmpb $255, %dl // 3 e
541 movb -1(%edi), %bl // 3 h
543 movb fadetbl(%edx), %cl // 3 g
545 xorl %ebx, %ecx // 2 i and 3 i
547 andl %ecx, %eax // 2 j and 3 j
548 movl (lastquartet), %ecx
549 xorl %ebx, %eax // 2 k and 3 k
550 movb %ah, -2(%edi) // 2 l
552 movb %al, -1(%edi) // 3 l
555 // Quit if there are none at all left.
557 cmpl (lastpixel), %edi
563 // Here we finish off the last one-to-three pixels assigned to us.
564 // Rather than calculating values for all four pixels, we just divide
565 // the difference by four and keep adding this average into the value
566 // as needed. (This code is not particularly optimized, by the way,
567 // since it represents such a minuscule amount of the running time.)
583 andl $0x003FC0FF, %ecx
585 movl (lastpixel), %ebp
589 LoopLastBits: movl %ebx, %eax
593 movb (%esi,%eax), %dl
596 movb fadetbl(%edx), %al
598 LetPixelBy: addl %ecx, %ebx
606 // We're done! Clear the stacks, reset the FPU control word, and we
607 // are so out of here.