1 /// tmap_scanline_per - Pentium-optimized assembly version
2 /// written by Brian Raiter, Mar 1998.
3 /// lighting roundoff error fixed by Matt Mueller, July 1999
6 /// The gist of the algorithm is as follows (note that this is
7 /// pseudocode, not actual C):
13 /// int x, ubyz, vbyz;
14 /// byte texmap[64][64] = pixptr;
15 /// byte framebuffer[][bytes_per_row] = write_buffer;
16 /// byte lightingtable[][256] = gr_fade_table;
19 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
20 /// ubyz = (u / z) & 63;
21 /// vbyz = (v / z) & 63;
22 /// c = texmap[ubyz][vbyz];
23 /// if (c != TRANSPARENT_COLOR)
24 /// framebuffer[fx_y][x] = lightingtable[l / 65536][c];
31 /// The global variable Transparency_on is zero when it is known that
32 /// there are no transparencies involved, so in that case we use a
33 /// different loop that skips the transparency test.
35 /// The actual algorithm used here only does the division calculations
36 /// every fourth pixel, and linearly interpolates the other three.
37 /// Something along the lines of:
39 /// /* Initial values as before */
40 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
44 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
45 /// u += fx_du_dx * 4;
46 /// v += fx_dv_dx * 4;
47 /// z += fx_dz_dx * 4;
50 /// du1 = (ubyz4 - ubyz0) / 4;
51 /// dv1 = (vbyz4 - vbyz0) / 4;
54 /// for (i = 0 ; i < 4 ; ++i) {
55 /// c = texmap[ubyz & 63][vbyz & 63];
56 /// if (c != TRANSPARENT_COLOR)
57 /// framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
65 /// for ( ; x <= fx_xright ; ++x) {
66 /// /* Finish off remaining 0-3 pixels */
69 /// So much for the basic overview.
71 /// In this version, the Pentium's floating-point unit is pressed into
72 /// service to do the actual divisions, so that 1/z can be calculated
73 /// first, and the resulting reciprocal multiplied with u and v. These
74 /// two products are then stored back out as integers. This keeps us
75 /// down to doing only one division every four pixels, during which
76 /// other integer instructions can be overlapped.
78 /// The algorithm actually divides 64 by z, so that the rounded-off
79 /// products will effectively be stored with six fraction bits. This
80 /// allows the algorithm to correct for minor floating-point roundoff
81 /// errors. Two fraction bits are kept during the interpolation of the
82 /// three middle pixels, which hopefully increases the accuracy of the
85 /// We only need the lowest six (integral) bits of u/z and v/z for
86 /// each pixptr offset, so we only need eight bits of each fourth pair
87 /// of values to figure the interpolation. Along with the two fractional
88 /// bits we keep for extra precision flavor, this makes ten bits for
89 /// each value, or twenty to store the full pair. To simplify the
90 /// interpolation, the pair is packed into a single 32-bit register
95 /// ________vvVVVVVVvv____uuUUUUUUuu
98 /// The unused bits between the u and v values permit the packed
99 /// values to be added/subtracted without the u values spilling over
100 /// into the v values. Then, after anding out the carry/borrow bits,
101 /// the instructions "movb %al, %ah ; shrl $10, %eax" nicely
102 /// right-justify the desired values into a pixptr offset.
104 /// The FP stack is loaded up with the values of u, v, and z,
105 /// converted to floats. %ebp is used to hold the value of l, %esi is
106 /// set to pixptr, and %edi points to our current position in
111 // This is used to abbreviate an annoying external variable name.
113 .equ fadetbl, _gr_fade_table
116 // The following macro encapsulates the floating-point instructions
117 // that put the results of a prior division to use and prepare for the
118 // next division. At the beginning of the macro, the FP stack contains
119 // (from top to bottom): z, u, v, 64/z. The macro computes (64*u)/z,
120 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
121 // The number (2^51 + 2^52) is added to each number before they are
122 // stored as qwords. Since qwords only have 52 bits of precision, this
123 // magic number causes the fractional part to be shifted off the end,
124 // leaving the integral part right-shifted. Thus, reading the low
125 // dword gives the original number rounded off to the nearest integer
126 // - in two's complement, no less. (This technique allows for more
127 // pipelining than using the more straightforward fist/p
128 // instruction.) Simultaneous with this, the macro adds dudx to u,
129 // dvdx to v, and dzdx to z, and finally puts 64 back onto the stack.
130 // At the end of the macro, the stack contains: z, u, v, 64.
132 .macro DoFPCalcs // The FP stack after each instruction:
// Entry stack (top first): z, u, v, 64/z. Exit stack: z', u', v', 64.
// Two additional FP stack slots are used as scratch, so six entries are
// live at the deepest point (after the flds below). The rounded products
// are written to ubyz4 and vbyz4 as doubles by the two fstpl stores.
134 fadds (dzdx) // z += dzdx z' u v 64/z
135 fxch %st(1) // u z' v 64/z
136 fst %st(4) // u z' v 64/z u
137 fmul %st(3) // (64 / z) * u u/z z' v 64/z u
138 fxch %st(4) // u z' v 64/z u/z
139 fadds (dudx) // u += dudx u' z' v 64/z u/z
140 fxch %st(2) // v z' u' 64/z u/z
141 fmul %st, %st(3) // (64 / z) * v v z' u' v/z u/z
142 fxch %st(4) // u/z z' u' v/z v
143 fadds (magic) // U/Z z' u' v/z v
144 fxch %st(4) // v z' u' v/z U/Z
145 fadds (dvdx) // v += dvdx v' z' u' v/z U/Z
146 fxch %st(3) // v/z z' u' v' U/Z
147 fadds (magic) // V/Z z' u' v' U/Z
148 flds (flt64) // 64 V/Z z' u' v' U/Z
149 fxch %st(5) // U/Z V/Z z' u' v' 64
150 fstpl (ubyz4) // V/Z z' u' v' 64
151 fstpl (vbyz4) // z' u' v' 64
152 // (ready to start the next division)
157 .equ _gr_fade_table, gr_fade_table
158 .equ _write_buffer, write_buffer
159 .equ _bytes_per_row,bytes_per_row
160 .equ _fx_xleft, fx_xleft
161 .equ _fx_xright, fx_xright
167 .equ _fx_du_dx, fx_du_dx
168 .equ _fx_dv_dx, fx_dv_dx
169 .equ _fx_dz_dx, fx_dz_dx
170 .equ _fx_dl_dx, fx_dl_dx
171 .equ _Transparency_on, Transparency_on
173 .globl asm_tmap_scanline_per
175 .globl _asm_tmap_scanline_per
178 .extern _pixptr, _gr_fade_table, _write_buffer
179 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
180 .extern _fx_u, _fx_v, _fx_z, _fx_l
181 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
182 .extern _Transparency_on
187 //.local dudx, dvdx, dzdx, dldx
188 //.local ubyz4, vbyz4, uvzero
189 //.local lastquartet, lastpixel, ctlwd
190 //.local flt64, magic
// Private working storage for asm_tmap_scanline_per and DoFPCalcs.
// flt64, magic, dudx, dvdx and dzdx are read with flds/fadds, i.e. as
// single-precision floats; ubyz4 and vbyz4 are written with fstpl, i.e.
// as doubles.
197 lastquartet: .long 0 // frame-buffer address at which the 4-pixels loop stops
198 lastpixel: .long 0 // frame-buffer address at which drawing stops entirely
199 flt64: .long 0x42800000 // 64.0 as a single-precision float (what we divide z into)
200 magic: .long 0x59C00000 // 2^51 + 2^52 as a single-precision float (to get ints from floats)
201 ubyz4: .double 0.0 // rounded (64*u)/z for the next iteration; integer is in the low dword
202 vbyz4: .double 0.0 // rounded (64*v)/z for the next iteration; integer is in the low dword
203 dudx: .long 0 // u's rate of change as a float (times four, per the setup comments)
204 dvdx: .long 0 // v's rate of change as a float (times four, per the setup comments)
205 dzdx: .long 0 // z's rate of change as a float (times four, per the setup comments)
206 dldx: .long 0 // l's rate of change as an integer (times four, per the setup comments)
207 uvzero: .long 0 // packed u/z and v/z values for pixel 0 of the current quartet
208 ctlwd: .word 0 // the pre-tweaked FPU control word
216 // void asm_tmap_scanline_per(void)
220 asm_tmap_scanline_per:
222 _asm_tmap_scanline_per:
225 // Save registers the compiler might be using.
231 // Tell the FPU to use 64-bit numbers (still plenty precise enough for
232 // our needs) so as to speed up fdiv.
243 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
244 // into floating point.
246 movl (_fx_du_dx), %ebx
247 movl (_fx_dv_dx), %ecx
249 movl (_fx_dz_dx), %edx
263 // bytes_per_row * fx_y is the offset for the current scanline. (We do
264 // this now before we start the first FP division.)
266 movl (_bytes_per_row), %eax
270 // Push 64.0, v, u, and z onto the FPU stack, and then start
271 // calculating the first 64 / z.
279 // Meanwhile, get l and dldx (again, the latter multiplied by four)
280 // into %edx and %ebp, where they will be stored for the duration. The
281 // original values are divided by 256 so that the byte needed for the
282 // fade table offset is squarely in %dh.
284 //Dividing by 256 is bad.. rounding errors and crap. We'll now do that
285 //right before we need to access the table instead. -MM
289 movl (_fx_dl_dx), %edx
294 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
295 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
296 // %edi so that it points to pixel (fx_y)[fx_xleft]. Calculate a
297 // pointer to (fx_y)[fx_xright + 1] so we know when to stop drawing.
298 // Also calculate a pointer to (fx_y)[(fx_xright + 1) & ~3] so we know
299 // when to stop drawing four pixels at a time.
302 movl (_write_buffer), %edi
303 movl (_fx_xright), %ecx
307 addl (_fx_xleft), %edi
312 movl %ecx, (lastpixel)
314 movl %ecx, (lastquartet)
316 // Calculate round(64 * u / z) and round(64 * v / z), store, and
317 // increment u, v, and z. Then start calculating the second 64 / z.
322 // Get our u/z and v/z values, lop off the bits we don't care about,
323 // pack, and store in uvzero.
336 // While we're waiting for the last division to finish, we might as
337 // well get the frame buffer into the cache.
341 // Are there at least four pixels to draw? If not, skip to the epilog
347 // Do we need to test for transparencies?
349 testl $(~0), (_Transparency_on)
352 // If not, then use the simpler loop here.
359 // While the FPU is busy dividing, the latest u/z and v/z values are
360 // retrieved, packed, and stored in uvzero (to be used again in the
361 // next iteration). The old uvzero value, which contains the uv values
362 // for pixel 0, gets subtracted from the new uvzero value to
363 // determine the total change in u/z and v/z across the four pixels,
364 // and this is divided by 4 to get the average. This average is then
365 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
366 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
367 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
368 // respectively. %edx is set to the current value of l, such that %dh is
369 // already set as half of the offset into fadetbl. Each uv value is
370 // used to set its pixel as follows (assuming our packed uv value is
373 // a: andl $0x003F00FC, %ebx / mask off extraneous bits
374 // b: movb %bl, %bh / make u flush with v
375 // c: shrl $10, %ebx / right-justify u and v
376 // d: movb (%esi,%ebx), %dl / get texture-map pixel
377 // e: movb fadetbl(%edx), %bl / correct for lighting level
378 // f: movb %bl, (%edi) / write pixel to frame buffer
380 // The above is done four times, once for each pixel. All of the
381 // calculations are interleaved in order to avoid AGI stalls and
382 // missed pairing opportunities.
394 movl (uvzero), %ecx // %ecx = uv value for pixel 0
398 andl $0x003F00FC, %ecx // 0 a
402 shrl $10, %ecx // 0 c
407 movb (%esi,%ecx), %dl // 0 d
409 lea (%eax,%ebx,2), %ecx // %ecx = uv value for pixel 2
410 addl %ebx, %eax // %eax = uv value for pixel 1
411 addl %ecx, %ebx // %ebx = uv value for pixel 3
412 andl $0x003F00FC, %ecx // 2 a
414 movb fadetbl(%edx), %dl // 0 e
415 shrl $10, %ecx // 2 c
416 andl $0x003F00FC, %eax // 1 a
417 movb %dl, -4(%edi) // 0 f
419 movb (%esi,%ecx), %dl // 2 d
420 andl $0x003F00FC, %ebx // 3 a
421 shrl $10, %eax // 1 c
423 movb fadetbl(%edx), %cl // 2 e
424 movb (%esi,%eax), %dl // 1 d
425 shrl $10, %ebx // 3 c
426 movb %cl, -2(%edi) // 2 f
428 movb fadetbl(%edx), %al // 1 e
429 movb (%esi,%ebx), %dl // 3 d
430 movb %al, -3(%edi) // 1 f
432 movb fadetbl(%edx), %bl // 3 e
433 movl (lastquartet), %ecx
434 movb %bl, -1(%edi) // 3 f
438 // Are there any pixels left at all?
440 cmpl (lastpixel), %edi
449 // This is similar to the LoopTransOff loop, the big change being that
450 // each value retrieved from the texture map is tested against 255,
451 // the transparent "color". A value of 255 in the texture map means to
452 // let the existing value for that pixel in write_buffer go by
453 // unchanged. Thus the code for each pixel looks something like this
456 // a: andl $0x003F00FC, %ebx / mask off extraneous bits
457 // b: movb %bl, %bh / make u flush with v
458 // c: shrl $10, %ebx / right-justify u and v
459 // d: movb (%esi,%ebx), %dl / get texture-map pixel
460 // e: cmpb $255, %dl / is pixel transparent?
461 // f: sbbb %bh, %bh / yes, %bh=00; no, %bh=FF
462 // g: movb fadetbl(%edx), %dl / get lighting-corrected pixel
463 // h: movb (%edi), %bl / get pixel in frame buffer now
464 // i: xorb %bl, %dl / combine the two
465 // j: andb %dl, %bh / use %bh as a mask to select
466 // k: xorb %bl, %bh / which pixel to keep
467 // l: movb %bh, (%edi) / write pixel to frame buffer
469 // When the texture-map value is 255, the code simply writes the
470 // original frame-buffer value back out again; otherwise the new pixel
471 // is written instead. The ands and xors used to accomplish this bulk
472 // up the code, but on the whole it is better than having four
473 // unpredictable jumps in the loop. The four repeats of the above code
474 // are even more intertwined than the other loop, due to the extra
475 // register usage. Also note that the last two pixels combine steps i,
476 // j, and k with each other.
484 movl (uvzero), %ecx // %ecx = uv for pixel 0
490 andl $0x003F00FC, %ecx // 0 a
494 shrl $10, %ecx // 0 c
499 movb (%esi,%ecx), %dl // 0 d
502 movb -4(%edi), %ch // 0 h
503 movb fadetbl(%edx), %cl // 0 g
504 cmpb $255, %dl // 0 e
510 / nop // (V-pipe idle)
511 lea (%eax,%ebx,2), %ecx // %ecx = uv for pixel 2
512 addl %ebx, %eax // %eax = uv for pixel 1
513 andl $0x003F00FC, %eax // 1 a
514 addl %ecx, %ebx // %ebx = uv for pixel 3
516 andl $0x003F00FC, %ecx // 2 a
517 shrl $10, %eax // 1 c
518 andl $0x003F00FC, %ebx // 3 a
521 movb %dl, -4(%edi) // 0 l
522 movb (%esi,%eax), %dl // 1 d
523 movb -3(%edi), %al // 1 h
524 cmpb $255, %dl // 1 e
526 movb fadetbl(%edx), %dl // 1 g
527 shrl $10, %ecx // 2 c
529 shrl $10, %ebx // 3 c
532 movb (%esi,%ecx), %dl // 2 d
533 movb %ah, -3(%edi) // 1 l
534 cmpb $255, %dl // 2 e
536 movb fadetbl(%edx), %ch // 2 g
537 movb (%esi,%ebx), %dl // 3 d
538 movb -2(%edi), %bh // 2 h
539 cmpb $255, %dl // 3 e
540 movb -1(%edi), %bl // 3 h
542 movb fadetbl(%edx), %cl // 3 g
544 xorl %ebx, %ecx // 2 i and 3 i
546 andl %ecx, %eax // 2 j and 3 j
547 movl (lastquartet), %ecx
548 xorl %ebx, %eax // 2 k and 3 k
549 movb %ah, -2(%edi) // 2 l
551 movb %al, -1(%edi) // 3 l
554 // Quit if there are none at all left.
556 cmpl (lastpixel), %edi
562 // Here we finish off the last one-to-three pixels assigned to us.
563 // Rather than calculating values for all four pixels, we just divide
564 // the difference by four and keep adding this average into the value
565 // as needed. (This code is not particularly optimized, by the way,
566 // since it represents such a minuscule amount of the running time.)
582 andl $0x003FC0FF, %ecx
584 movl (lastpixel), %ebp
588 LoopLastBits: movl %ebx, %eax
592 movb (%esi,%eax), %dl
595 movb fadetbl(%edx), %al
597 LetPixelBy: addl %ecx, %ebx
605 // We're done! Clear the stacks, reset the FPU control word, and we
606 // are so out of here.