1 /// $Id: tmapppro.S,v 1.5 2003-02-18 20:15:48 btb Exp $
2 /// tmap_scanline_per - Pentium-Pro-optimized assembly version
3 /// written by Brian Raiter, Mar 1998.
4 /// lighting roundoff error fixed by Matt Mueller, July 1999
6 /// The gist of the algorithm is as follows (note that this is
7 /// pseudocode, not actual C):
13 /// int x, ubyz, vbyz;
14 /// byte texmap[64][64] = pixptr;
15 /// byte framebuffer[][bytes_per_row] = write_buffer;
16 /// byte lightingtable[][256] = gr_fade_table;
19 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
20 /// ubyz = (u / z) & 63;
21 /// vbyz = (v / z) & 63;
22 /// c = texmap[ubyz][vbyz];
23 /// if (c != TRANSPARENT_COLOR)
24 /// framebuffer[fx_y][x] = lightingtable[l / 65536][c];
31 /// The global variable Transparency_on is zero when it is known that
32 /// there are no transparencies involved, so in that case we use a
33 /// different loop that skips the transparency test.
35 /// The actual algorithm used here only does the division calculations
36 /// every fourth pixel, and linearly interpolates the other three.
37 /// Something along the lines of:
39 /// /* Initial values as before */
40 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
44 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
45 /// u += fx_du_dx * 4;
46 /// v += fx_dv_dx * 4;
47 /// z += fx_dz_dx * 4;
50 /// du1 = (ubyz4 - ubyz0) / 4;
51 /// dv1 = (vbyz4 - vbyz0) / 4;
54 /// for (i = 0 ; i < 4 ; ++i) {
55 /// c = texmap[ubyz & 63][vbyz & 63];
56 /// if (c != TRANSPARENT_COLOR)
57 /// framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
65 /// for ( ; x <= fx_xright ; ++x) {
66 /// /* Finish off remaining 0-3 pixels */
69 /// So much for the basic overview.
71 /// In this version, the PPro's floating-point unit is pressed into
72 /// service to do the actual divisions, so that 1/z can be calculated
73 /// first, and the resulting reciprocal multiplied with u and v. These
74 /// two products are then stored back out as integers. This keeps us
75 /// down to doing only one division every four pixels, during which
76 /// other integer instructions can be overlapped.
78 /// The algorithm actually divides 64 by z, so that the rounded-off
79 /// products will effectively be stored with six fraction bits. This
80 /// allows the algorithm to correct for minor floating-point roundoff
81 /// errors. Two fraction bits are kept during the interpolation of the
82 /// three middle pixels, which hopefully increases the accuracy of the
85 /// We only need the lowest six (integral) bits of u/z and v/z for
86 /// each pixptr offset, so we only need eight bits of each fourth
87 /// pair of values to figure the interpolation. Along with the two
88 /// fractional bits we keep for extra precision flavor, this makes ten
89 /// bits for each value, or twenty to store the full pair. To simplify
90 /// the interpolation, the pair is packed into a single 32-bit
95 /// vvVVVVVVvv____________uuUUUUUUuu
98 /// The unused bits between the u and v values permit the packed
99 /// values to be added/subtracted without the u values spilling over
100 /// into the v values. Then, the instructions "bswap %eax ; roll $6,
101 /// %eax ; andl $0x0FFF, %eax" will right-justify the desired values
102 /// into a pixptr offset.
104 /// The FP stack is loaded up with the values of u, v, and z,
105 /// converted to floats. %ebp is used to hold the value of l, %esi is
106 /// set to pixptr, and %edi points to our current position in
111 // This is used to abbreviate an annoying external variable name.
113 .equ fadetbl, _gr_fade_table
116 // The following macro encapsulates the floating-point instructions
117 // that put the results of a prior division to use and prepare for the
118 // next division. At the beginning of the macro, the FP stack contains
119 // (from top to bottom): 64/z, z, u, v. The macro computes (64*u)/z,
120 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
121 // Simultaneous with this, the macro adds dudx to u, dvdx to v, and
122 // dzdx to z, and finally puts 64 back onto the stack. At the end of
123 // the macro, the stack contains: 64, z, u, v.
125 .macro DoFPCalcs 0 // The FP stack after each instruction ("u/z" and "v/z" below are shorthand for (64*u)/z and (64*v)/z):
127 fst %st(4) // copy 64/z into st(4) 64/z z u v 64/z
128 fxch %st(2) // bring u to the top u z 64/z v 64/z
129 fmul %st, %st(4) // st(4) = (64 * u) / z u z 64/z v u/z
130 fadds (dudx) // u += dudx (single-precision memory operand) u' z 64/z v u/z
131 fxch %st(3) // bring v to the top v z 64/z u' u/z
132 fmul %st, %st(2) // st(2) = (64 * v) / z v z v/z u' u/z
133 fadds (dvdx) // v += dvdx v' z v/z u' u/z
134 fxch %st(1) // bring z to the top z v' v/z u' u/z
135 fadds (dzdx) // z += dzdx z' v' v/z u' u/z
136 fxch %st(2) // bring v/z to the top v/z v' z' u' u/z
137 flds (flt64) // push 64.0 64 v/z v' z' u' u/z
138 fxch %st(5) // swap 64 with u/z so both products can be popped u/z v/z v' z' u' 64
139 fistpl (ubyz4) // store u/z to ubyz4 as a 32-bit integer, pop v/z v' z' u' 64
140 fistpl (vbyz4) // store v/z to vbyz4 as a 32-bit integer, pop v' z' u' 64
141 fxch %st(3) // restore the entry ordering 64 z' u' v'
142 // (ready to start the next division: 64 is on top with the updated z just below it)
148 .equ _gr_fade_table, gr_fade_table // each .equ in this run defines the underscore-prefixed name as an alias of the plain symbol
149 .equ _write_buffer, write_buffer
150 .equ _bytes_per_row, bytes_per_row
151 .equ _fx_xleft, fx_xleft
152 .equ _fx_xright, fx_xright
158 .equ _fx_du_dx, fx_du_dx
159 .equ _fx_dv_dx, fx_dv_dx
160 .equ _fx_dz_dx, fx_dz_dx
161 .equ _fx_dl_dx, fx_dl_dx
162 .equ _Transparency_on, Transparency_on // the code below refers to the underscore-prefixed spellings
164 .globl asm_ppro_tmap_scanline_per // export the entry point under its plain name
166 .globl _asm_ppro_tmap_scanline_per // and under the underscore-prefixed name
169 .extern _pixptr, _gr_fade_table, _write_buffer // texture map, lighting (fade) table, frame buffer
170 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y // frame-buffer pitch and the scanline's extent/row
171 .extern _fx_u, _fx_v, _fx_z, _fx_l // starting u, v, z and l values for the scanline
172 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx // per-pixel rates of change of u, v, z and l
173 .extern _Transparency_on // zero when no transparency test is needed
175 //.local dudx, dvdx, dzdx, dldx, l
176 //.local ubyz4, vbyz4, uvzero
177 //.local lastquartet, lastpixel, ctlwd
184 dudx: .long 0 // u's rate of change as a single-precision float (added to u by DoFPCalcs)
185 dvdx: .long 0 // v's rate of change as a single-precision float (added to v by DoFPCalcs)
186 dzdx: .long 0 // z's rate of change as a single-precision float (added to z by DoFPCalcs)
187 dldx: .long 0 // l's rate of change as an integer
188 l: .long 0 // the current l value
189 ubyz4: .long 0 // (64*u)/z for the next iteration, written as an integer by DoFPCalcs
190 vbyz4: .long 0 // (64*v)/z for the next iteration, written as an integer by DoFPCalcs
191 uvzero: .long 0 // packed u/z and v/z values for pixel 0 of the current group of four
192 lastquartet: .long 0 // frame-buffer address where the 4-pixels loop stops (compared against %edi)
193 lastpixel: .long 0 // frame-buffer address where drawing stops entirely (compared against %edi)
194 flt64: .long 0x42800000 // 64.0 as an IEEE single (what we divide z into)
195 ctlwd: .long 0 // the pre-tweaked FPU control word, kept so it can be reset on exit
203 // void c_tmap_scanline_per(void)
207 asm_ppro_tmap_scanline_per:
209 _asm_ppro_tmap_scanline_per:
212 // Save registers the compiler might be using.
218 // Kick the FPU into the lowest precision (still enough for our needs)
219 // so as to speed up fdiv.
229 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
230 // into floating point.
232 movl (_fx_du_dx), %eax
235 movl (_fx_dv_dx), %eax
238 movl (_fx_dz_dx), %eax
249 // bytes_per_row * fx_y is the offset for the current scanline. (We do
250 // this now before we start the first FP division.)
252 movl (_bytes_per_row), %eax
256 // Push v, u, z, and 64.0 onto the FPU stack, and then start
257 // calculating the first 64 / z.
265 // Meanwhile, get l and dldx (again, the latter multiplied by four).
266 // l will be stored in %ebp for the duration. The original values are
267 // divided by 256 so that the byte needed for the fade table offset
270 //Dividing by 256 is bad.. rounding errors and crap. We'll now do that
271 //right before we need to access the table instead. -MM
276 movl (_fx_dl_dx), %edx
281 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
282 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
283 // %edi so that it points to pixel [fx_y][fx_xleft]. Calculate a
284 // pointer to [fx_y][fx_xright + 1] so we know when to stop drawing.
285 // Also calculate a pointer to [fx_y][(fx_xright + 1) & ~3] so we know
286 // when to stop drawing four pixels at a time.
289 movl (_write_buffer), %edi
290 movl (_fx_xright), %ecx
294 movl %ecx, (lastpixel)
295 addl (_fx_xleft), %edi
301 movl %ecx, (lastquartet)
303 // Calculate round(64 * u / z) and round(64 * v / z), store, and
304 // increment u, v, and z. Then start calculating the second 64 / z.
309 // Get our u/z and v/z values, lop off the bits we don't care
310 // about, pack, and store in uvzero.
323 // Are there at least four pixels to draw? If not, skip to the epilog
329 // Do we need to test for transparencies?
331 testl $(~0), (_Transparency_on)
334 // If not, then use the simpler loop here.
340 // While the FPU is busy dividing, the latest u/z and v/z values are
341 // retrieved, packed, and stored in uvzero (to be used again in the
342 // next iteration). The old uvzero value, which contains the uv values
343 // for pixel 0, gets subtracted from the new uvzero value to
344 // determine the total change in u/z and v/z across the four pixels,
345 // and this is divided by 4 to get the average. This average is then
346 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
347 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
348 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
349 // respectively, while %edx is our "cleansed" register for using byte
350 // values as memory pointer offsets. %ebp is loaded with the high byte
351 // of l, forming half of the offset for the fade table lookup. (The
352 // pixel from the texture-map bitmap supplies the other half.) Each
353 // value is used to set its pixel as follows (assuming %eax holds our
356 // a: bswapl %eax / move u and v to the
357 // b: roll $6, %eax / far right
358 // c: andl $0x0FFF, %eax / mask off extra bits
359 // d: movb (%esi,%eax), %dl / get texture-map pixel
360 // e: movb fadetbl(%edx,%ebp), %dl / correct for lighting
361 // f: movb %dl, (%edi) / write to frame buffer
363 // The above is done four times, once for each pixel. Some of the
364 // calculations may appear to be interleaved haphazardly, but the PPro
365 // seems to like it this way.
371 movl (uvzero), %eax // %eax = uv for pixel 0
374 andl $0x0FFF, %eax // 0 c
375 movb (%esi,%eax), %dl // 0 d
382 movb fadetbl(%edx,%ebp), %dl // 0 e
399 movb %dl, (%edi) // 0 f
400 lea (%eax,%ecx,2), %ebx // %ebx = uv for pixel 2
401 addl %ecx, %eax // %eax = uv for pixel 1
404 addl %ebx, %ecx // %ecx = uv for pixel 3
408 andl $0x0FFF, %eax // 1 c
409 andl $0x0FFF, %ebx // 2 c
412 movb (%esi,%eax), %dl // 1 d
413 movb fadetbl(%edx,%ebp), %al // 1 e
414 movb (%esi,%ebx), %dl // 2 d
415 movb fadetbl(%edx,%ebp), %bl // 2 e
416 movb %al, 1(%edi) // 1 f
417 andl $0x0FFF, %ecx // 3 c
418 movb %bl, 2(%edi) // 2 f
419 movb (%esi,%ecx), %dl // 3 d
420 movb fadetbl(%edx,%ebp), %cl // 3 e
421 movb %cl, 3(%edi) // 3 f
424 cmpl (lastquartet), %edi
427 // Are there any pixels left at all?
429 cmpl (lastpixel), %edi
438 // This is similar to the LoopTransOff loop, the big change being that
439 // each value retrieved from the texture map is tested against 255,
440 // the transparent "color". A value of 255 in the texture map means to
441 // let the existing value for that pixel in write_buffer go by
442 // unchanged. Thus the code for each pixel looks something like this
445 // a: bswapl %eax / move u and v to the
446 // b: roll $6, %eax / far right
447 // c: andl $0x0FFF, %eax / mask off extra bits
448 // d: movb (%esi,%eax), %dl / get texture-map pixel
449 // e: cmpb $255, %dl / is pixel transparent?
450 // f: sbbb %ah, %ah / yes:%ah=00, no:%ah=FF
451 // g: movb fadetbl(%edx,%ebp), %dl / correct for lighting
452 // h: movb (%edi), %al / get current pixel
453 // i: xorb %al, %dl / combine the two
454 // j: andb %dl, %ah / use %ah as a mask to
455 // k: xorb %ah, %al / select which pixel
456 // l: movb %al, (%edi) / write to frame buffer
458 // When the texture-map value is 255, the code simply writes the
459 // original frame-buffer value back out again; otherwise the new pixel
460 // is written instead. The ands and xors used to accomplish this bulk
461 // up the code, but on the whole it is better than having four
462 // unpredictable jumps in the loop.
467 movl (uvzero), %eax // %eax = uv for pixel 0
473 andl $0x0FFF, %eax // 0 c
475 movb (%esi,%eax), %dl // 0 d
476 cmpb $255, %dl // 0 e
482 movb fadetbl(%edx,%ebp), %dl // 0 g
483 movb (%edi), %al // 0 h
487 movb %al, (%edi) // 0 l
504 lea (%eax,%ecx,2), %ebx // %ebx = uv for pixel 2
505 addl %ecx, %eax // %eax = uv for pixel 1
508 addl %ebx, %ecx // %ecx = uv for pixel 3
511 andl $0x0FFF, %eax // 1 c
512 movb (%esi,%eax), %dl // 1 d
513 cmpb $255, %dl // 1 e
516 movb 1(%edi), %al // 1 h
517 movb fadetbl(%edx,%ebp), %dl // 1 g
520 andl $0x0FFF, %ebx // 2 c
523 movb (%esi,%ebx), %dl // 2 d
524 cmpb $255, %dl // 2 e
526 movb fadetbl(%edx,%ebp), %dl // 2 g
527 andl $0x0FFF, %ecx // 3 c
528 movb 2(%edi), %bl // 2 h
532 movb (%esi,%ecx), %dl // 3 d
533 cmpb $255, %dl // 3 e
535 movb 3(%edi), %cl // 3 h
536 movb fadetbl(%edx,%ebp), %dl // 3 g
541 movb %al, 1(%edi) // 1 l
543 movb %bl, 2(%edi) // 2 l
545 movb %cl, 3(%edi) // 3 l
548 cmpl (lastquartet), %edi
551 // Quit if there are none at all left.
553 cmpl (lastpixel), %edi
559 // Here we finish off the last one-to-three pixels assigned to us.
560 // Rather than calculating values for all four pixels, we just divide
561 // the difference by four and keep adding this average into the value
562 // as needed. (This code is not particularly optimized, by the way,
563 // since it represents such a minuscule amount of the running time.)
584 LoopLastBits: movl %ebx, %eax
588 movb (%esi,%eax), %dl
591 movb fadetbl(%edx,%ebp), %dl
593 LetPixelBy: incl %edi
595 cmpl (lastpixel), %edi
601 // We're done! Clear the stacks, reset the FPU control word, and we
602 // are so out of here.