texmap/tmappent.S

   1 /// $Id: tmappent.S,v 1.4 2003-02-18 20:15:48 btb Exp $
   2 /// tmap_scanline_per - Pentium-optimized assembly version
   3 /// written by Brian Raiter, Mar 1998.
   4 /// lighting roundoff error fixed by Matt Mueller, July 1999
   5
   6
   7 /// The gist of the algorithm is as follows (note that this is
   8 /// pseudocode, not actual C):
   9 ///
  10 /// int  u = fx_u;
  11 /// int  v = fx_v;
  12 /// int  z = fx_z;
  13 /// int  l = fx_l;
  14 /// int  x, ubyz, vbyz;
  15 /// byte texmap[64][64] = pixptr;
  16 /// byte framebuffer[][bytes_per_row] = write_buffer;
  17 /// byte lightingtable[][256] = gr_fade_table;
  18 /// byte c;
  19 ///
  20 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
  21 ///     ubyz = (u / z) & 63;
  22 ///     vbyz = (v / z) & 63;
  23 ///     c = texmap[ubyz][vbyz];
  24 ///     if (c != TRANSPARENT_COLOR)
  25 ///         framebuffer[fx_y][x] = lightingtable[l / 65536][c];
  26 ///     u += fx_du_dx;
  27 ///     v += fx_dv_dx;
  28 ///     z += fx_dz_dx;
  29 ///     l += fx_dl_dx;
  30 /// }
  31 ///
  32 /// The global variable Transparency_on is zero when it is known that
  33 /// there are no transparencies involved, so in that case we use a
  34 /// different loop that skips the transparency test.
  35 ///
  36 /// The actual algorithm used here only does the division calculations
  37 /// every fourth pixel, and linearly interpolates the other three.
  38 /// Something along the lines of:
  39 ///
  40 /// /* Initial values as before */
  41 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
  42 ///
  43 /// ubyz0 = u / z;
  44 /// vbyz0 = v / z;
  45 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
  46 ///     u += fx_du_dx * 4;
  47 ///     v += fx_dv_dx * 4;
  48 ///     z += fx_dz_dx * 4;
  49 ///     ubyz4 = u / z;
  50 ///     vbyz4 = v / z;
  51 ///     du1 = (ubyz4 - ubyz0) / 4;
  52 ///     dv1 = (vbyz4 - vbyz0) / 4;
  53 ///     ubyz = ubyz0;
  54 ///     vbyz = vbyz0;
  55 ///     for (i = 0 ; i < 4 ; ++i) {
  56 ///         c = texmap[ubyz & 63][vbyz & 63];
  57 ///         if (c != TRANSPARENT_COLOR)
  58 ///             framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
  59 ///         ubyz += du1;
  60 ///         vbyz += dv1;
  61 ///         l += fx_dl_dx;
  62 ///     }
  63 ///     ubyz0 = ubyz4;
  64 ///     vbyz0 = vbyz4;
  65 /// }
  66 /// for ( ; x <= fx_xright ; ++x) {
  67 ///     /* Finish off remaining 0-3 pixels */
  68 /// }
  69 ///
  70 /// So much for the basic overview.
  71 ///
  72 /// In this version, the Pentium's floating-point unit is pressed into
  73 /// service to do the actual divisions, so that 1/z can be calculated
  74 /// first, and the resulting reciprocal multiplied with u and v. These
  75 /// two products are then stored back out as integers. This keeps us
  76 /// down to doing only one division every four pixels, during which
  77 /// other integer instructions can be overlapped.
  78 ///
  79 /// The algorithm actually divides 64 by z, so that the rounded-off
  80 /// products will effectively be stored with six fraction bits. This
  81 /// allows the algorithm to correct for minor floating-point roundoff
  82 /// errors. Two fraction bits are kept during the interpolation of the
  83 /// three middle pixels, which hopefully increases the accuracy of the
  84 /// approximations.
  85 ///
  86 /// We only need the lowest six (integral) bits of u/z and v/z for
  87 /// each pixptr offset, so we only need eight bits of each fourth pair
  88 /// of values to figure the interpolation. Along with the two fractional
  89 /// bits we keep for extra precision flavor, this makes ten bits for
  90 /// each value, or twenty to store the full pair. To simplify the
  91 /// interpolation, the pair is packed into a single 32-bit register
  92 /// like so:
  93 ///
  94 ///             3      2       1
  95 ///             1      4       6       8       0
  96 ///             ________vvVVVVVVvv____uuUUUUUUuu
  97 ///                       \v&63/        \u&63/
  98 ///
  99 /// The unused bits between the u and v values permit the packed
 100 /// values to be added/subtracted without the u values spilling over
 101 /// into the v values. Then, after anding out the carry/borrow bits,
 102 /// the instructions "movb %al, %ah ; shrl $10, %eax" nicely
 103 /// right-justify the desired values into a pixptr offset.
 104 ///
 105 /// The FP stack is loaded up with the values of u, v, and z,
 106 /// converted to floats. %ebp is used to hold the value of l, %esi is
 107 /// set to pixptr, and %edi points to our current position in
 108 /// write_buffer.
 109
 110
 111
 112 // fadetbl abbreviates the external lighting table _gr_fade_table. The
     // pixel loops index it as fadetbl(%edx), with the light level in %dh
     // and the texture-map pixel in %dl.
 113
 114 .equ    fadetbl, _gr_fade_table
 115
 116
 117 // The following macro encapsulates the floating-point instructions
 118 // that put the results of a prior division to use and prepare for the
 119 // next division. At the beginning of the macro, the FP stack contains
 120 // (from top to bottom): z, u, v, 64/z. The macro computes (64*u)/z,
 121 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
 122 // The number (2^51 + 2^52) is added to each number before they are
 123 // stored as doubles. Since doubles only have 52 explicit mantissa
 124 // bits, this magic number causes the fractional part to be shifted off
 125 // the end, leaving the integral part right-justified. Thus, reading the
 126 // low dword gives the original number rounded off to an integer
 127 // - in two's complement, no less. (This technique allows for more
 128 // pipelining than using the more straightforward fist/p
 129 // instruction.) Simultaneous with this, the macro adds dudx to u,
 130 // dvdx to v, and dzdx to z, and finally puts 64 back onto the stack.
 131 // At the end of the macro, the stack contains: z, u, v, 64.
     //
     // The macro takes no arguments and touches no integer registers. It
     // reads dzdx, dudx, dvdx, magic and flt64, and writes ubyz4 and vbyz4.
     // "fst %st(4)" parks a copy of u in a fifth stack register, so that
     // register is assumed to be free on entry. The rounding trick depends
     // on the FPU's precision-control field, which the routine using this
     // macro changes before the first use (assumes the rounding mode is
     // round-to-nearest -- TODO confirm against callers).
 132
 133 .macro DoFPCalcs                // The FP stack after each instruction:
 134                                 //                 z    u    v  64/z
 135         fadds   (dzdx)          // z += dzdx      z'   u    v  64/z
 136         fxch    %st(1)          //                u    z'   v  64/z
 137         fst     %st(4)          // copy u down    u    z'   v  64/z   u
 138         fmul    %st(3)          // (64 / z) * u  u/z   z'   v  64/z   u
 139         fxch    %st(4)          //                u    z'   v  64/z  u/z
 140         fadds   (dudx)          // u += dudx      u'   z'   v  64/z  u/z
 141         fxch    %st(2)          //                v    z'   u' 64/z  u/z
 142         fmul    %st, %st(3)     // (64 / z) * v   v    z'   u'  v/z  u/z
 143         fxch    %st(4)          //               u/z   z'   u'  v/z   v
 144         fadds   (magic)         // U/Z = u/z+magic     U/Z   z'   u'  v/z   v
 145         fxch    %st(4)          //                v    z'   u'  v/z  U/Z
 146         fadds   (dvdx)          // v += dvdx      v'   z'   u'  v/z  U/Z
 147         fxch    %st(3)          //               v/z   z'   u'   v'  U/Z
 148         fadds   (magic)         // V/Z = v/z+magic     V/Z   z'   u'   v'  U/Z
 149         flds    (flt64)         //                64  V/Z   z'   u'   v'  U/Z
 150         fxch    %st(5)          //               U/Z  V/Z   z'   u'   v'   64
 151         fstpl   (ubyz4)         // store as double     V/Z   z'   u'   v'   64
 152         fstpl   (vbyz4)         // store as double      z'   u'   v'   64
 153                                 // (ready to start the next division)
 154 .endm
 155
     // The code below refers to every C global by its underscore-prefixed
     // name. When __linux__ is defined, each of those names is aliased to
     // the plain name (presumably because C symbols carry no leading
     // underscore there), and the entry point is exported without the
     // underscore as well.
 156 #ifdef __linux__
 157 .equ _pixptr, pixptr
 158 .equ _gr_fade_table, gr_fade_table
 159 .equ _write_buffer, write_buffer
 160 .equ _bytes_per_row,bytes_per_row
 161 .equ _fx_xleft, fx_xleft
 162 .equ _fx_xright, fx_xright
 163 .equ _fx_y, fx_y
 164 .equ _fx_u, fx_u
 165 .equ _fx_v, fx_v
 166 .equ _fx_z, fx_z
 167 .equ _fx_l, fx_l
 168 .equ _fx_du_dx, fx_du_dx
 169 .equ _fx_dv_dx, fx_dv_dx
 170 .equ _fx_dz_dx, fx_dz_dx
 171 .equ _fx_dl_dx, fx_dl_dx
 172 .equ _Transparency_on, Transparency_on
 173
 174 .globl asm_pent_tmap_scanline_per
 175 #else
 176 .globl _asm_pent_tmap_scanline_per
 177 #endif
 178
     // Globals read by the routine: texture and frame-buffer pointers, the
     // scanline position, the starting u/v/z/l values and their per-pixel
     // deltas, and the transparency flag.
 179 .extern _pixptr, _gr_fade_table, _write_buffer
 180 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
 181 .extern _fx_u, _fx_v, _fx_z, _fx_l
 182 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
 183 .extern _Transparency_on
 184
 185
 186
 187
 188 //.local  dudx, dvdx, dzdx, dldx
 189 //.local  ubyz4, vbyz4, uvzero
 190 //.local  lastquartet, lastpixel, ctlwd
 191 //.local  flt64, magic
 192
 193
     // Static scratch variables and constants used by
     // asm_pent_tmap_scanline_per and the DoFPCalcs macro. The four longs
     // ahead of ubyz4 occupy 16 bytes, so with the 8-byte alignment below
     // both doubles start on 8-byte boundaries.
 194 .data
 195
 196 .balign 8
 197
 198 lastquartet:    .long   0               // address at which the 4-pixels loop stops
 199 lastpixel:      .long   0               // address one past the last pixel to draw
 200 flt64:          .long   0x42800000      // 64.0 as a single (what we divide z into)
 201 magic:          .long   0x59C00000      // 2^51 + 2^52 as a single (to get ints from floats)
 202 ubyz4:          .double 0.0             // 64*u/z + magic; low dword read as an integer
 203 vbyz4:          .double 0.0             // 64*v/z + magic; low dword read as an integer
 204 dudx:           .long   0               // 4 * fx_du_dx, stored as a float
 205 dvdx:           .long   0               // 4 * fx_dv_dx, stored as a float
 206 dzdx:           .long   0               // 4 * fx_dz_dx, stored as a float
 207 dldx:           .long   0               // 4 * fx_dl_dx, kept as an integer
 208 uvzero:         .long   0               // packed u/z and v/z for the next pixel 0
 209 ctlwd:          .word   0               // caller's FPU control word, restored on exit
 210
 211
 212 .text
 213
 214 .balign 4
 215
 216 //
 217 // void asm_pent_tmap_scanline_per(void)
 218 //
     // Draws one perspective-correct, lit, texture-mapped scanline into
     // write_buffer. It takes no stack arguments: every input comes from
     // the globals pixptr, write_buffer, bytes_per_row, fx_y, fx_xleft,
     // fx_xright, fx_u, fx_v, fx_z, fx_l, fx_du_dx, fx_dv_dx, fx_dz_dx,
     // fx_dl_dx, Transparency_on and gr_fade_table.
     //
     // Register roles for the duration of the routine:
     //      %esi = pixptr (texture map)
     //      %edi = current position in the frame buffer
     //      %ebp = l (lighting), until LastBits reuses it for lastpixel
     //      %eax, %ebx, %ecx, %edx = scratch
     //
     // Preserved: %ebx, %ebp, %esi, %edi (all callee-saved in the i386 C
     // calling convention). Clobbered: %eax, %ecx, %edx, the flags and the
     // x87 condition codes. The FPU control word is changed on entry and
     // restored on exit; the four values pushed onto the FP stack are
     // popped again before returning. Working state is kept in the static
     // .data variables, not on the stack.
 219
 220 #ifdef __linux__
 221 asm_pent_tmap_scanline_per:
 222 #else
 223 _asm_pent_tmap_scanline_per:
 224 #endif
 225
 226 // Save the callee-saved registers that this routine uses as scratch.
     // %ebx must be included: the C caller expects it to survive the call.
 227
 228                 pushl   %ebp
 229                 pushl   %edi
 230                 pushl   %esi
                     pushl   %ebx
 231
 232 // Tell the FPU to use 64-bit numbers (still plenty precise enough for
 233 // our needs) so as to speed up fdiv. The caller's control word is
     // written back to ctlwd afterwards so that it can be restored on exit.
 234
 235                 fnstcw  (ctlwd)
 236                 movw    (ctlwd), %ax
 237                 movl    %eax, %ebx
 238                 andb    $0xFC, %bh
 239                 orb     $0x02, %bh
 240                 movw    %bx, (ctlwd)
 241                 fldcw   (ctlwd)
 242                 movw    %ax, (ctlwd)
 243
 244 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
 245 // into floating point.
 246
 247                 movl    (_fx_du_dx), %ebx
 248                 movl    (_fx_dv_dx), %ecx
 249                 sall    $2, %ebx
 250                 movl    (_fx_dz_dx), %edx
 251                 sall    $2, %ecx
 252                 movl    %ebx, (dudx)
 253                 sall    $2, %edx
 254                 movl    %ecx, (dvdx)
 255                 movl    %edx, (dzdx)
 256                 fildl   (dudx)
 257                 fildl   (dvdx)
 258                 fildl   (dzdx)
 259                 fxch    %st(2)
 260                 fstps   (dudx)
 261                 fstps   (dvdx)
 262                 fstps   (dzdx)
 263
 264 // bytes_per_row * fx_y is the offset for the current scanline. (We do
 265 // this now before we start the first FP division.)
 266
 267                 movl    (_bytes_per_row), %eax
 268                 xorl    %edx, %edx
 269                 mull    (_fx_y)
 270
 271 // Push 64.0, v, u, and z onto the FPU stack, and then start
 272 // calculating the first 64 / z.
 273
 274                 flds    (flt64)
 275                 fildl   (_fx_v)
 276                 fildl   (_fx_u)
 277                 fildl   (_fx_z)
 278                 fdivr   %st, %st(3)
 279
 280 // Meanwhile, get l into %ebp, where it stays for the duration, and
 281 // store dldx (again, multiplied by four) in memory. An earlier version
 282 // pre-divided both by 256 here so that the byte needed for the fade
 283 // table offset sat squarely in %dh.
 284
 285 //Dividing by 256 is bad.. rounding errors and crap.  We'll now do that
 286 //right before we need to access the table instead.  -MM
 287
 288                 movl    (_fx_l), %ebp
 289 //              sarl    $8, %ebp
 290                 movl    (_fx_dl_dx), %edx
 291 //              sarl    $6, %edx
 292                 sall    $2, %edx
 293                 movl    %edx, (dldx)
 294
 295 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
 296 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
 297 // %edi so that it points to pixel (fx_y)[fx_xleft]. Calculate a
 298 // pointer to (fx_y)[fx_xright + 1] so we know when to stop drawing.
 299 // Also calculate the pointer at which the four-pixels-at-a-time loop
 300 // must stop: lastpixel minus the pixel count modulo four.
     // If the pixel count is zero or negative there is nothing to draw.
 301
 302                 movl    (_pixptr), %esi
 303                 movl    (_write_buffer), %edi
 304                 movl    (_fx_xright), %ecx
 305                 addl    %eax, %edi
 306                 incl    %ecx
 307                 addl    %edi, %ecx
 308                 addl    (_fx_xleft), %edi
 309                 movl    %ecx, %eax
 310                 subl    %edi, %eax
 311                 jle     LeaveNow
 312                 andl    $3, %eax
 313                 movl    %ecx, (lastpixel)
 314                 subl    %eax, %ecx
 315                 movl    %ecx, (lastquartet)
 316
 317 // Calculate round(64 * u / z) and round(64 * v / z), store, and
 318 // increment u, v, and z. Then start calculating the second 64 / z.
 319
 320                 DoFPCalcs
 321                 fdivr   %st, %st(3)
 322
 323 // Get our u/z and v/z values, lop off the bits we don't care about,
 324 // pack, and store in uvzero.
 325
 326                 movl    (ubyz4), %eax
 327                 movl    (vbyz4), %ebx
 328                 incl    %eax
 329                 incl    %ebx
 330                 andl    $0x3FF0, %eax
 331                 andl    $0x3FF0, %ebx
 332                 shrl    $4, %eax
 333                 shll    $10, %ebx
 334                 orl     %eax, %ebx
 335                 movl    %ebx, (uvzero)
 336
 337 // While we're waiting for the last division to finish, we might as
 338 // well get the frame buffer into the cache.
 339
 340                 cmpb    (%edi), %al
 341
 342 // Are there at least four pixels to draw? If not, skip to the epilog
 343 // code.
 344
 345                 cmpl    %ecx, %edi
 346                 je      LastBits
 347
 348 // Do we need to test for transparencies?
 349
 350                 testl   $(~0), (_Transparency_on)
 351                 jnz     LoopTransOn
 352
 353 // If not, then use the simpler loop here.
 354
 355
 356 .balign 4
 357
 358 LoopTransOff:
 359
 360 // While the FPU is busy dividing, the latest u/z and v/z values are
 361 // retrieved, packed, and stored in uvzero (to be used again in the
 362 // next iteration). The old uvzero value, which contains the uv values
 363 // for pixel 0, gets subtracted from the new uvzero value to
 364 // determine the total change in u/z and v/z across the four pixels,
 365 // and this is divided by 4 to get the average. This average is then
 366 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
 367 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
 368 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
 369 // respectively. %edx is set to the current value of l, such that %dh is
 370 // already set as half of the offset into fadetbl. Each uv value is
 371 // used to set its pixel as follows (assuming our packed uv value is
 372 // in %ebx):
 373 //
 374 //      a:      andl    $0x003F00FC, %ebx       / mask off extraneous bits
 375 //      b:      movb    %bl, %bh                / make u flush with v
 376 //      c:      shrl    $10, %ebx               / right-justify u and v
 377 //      d:      movb    (%esi,%ebx), %dl        / get texture-map pixel
 378 //      e:      movb    fadetbl(%edx), %bl      / correct for lighting level
 379 //      f:      movb    %bl, (%edi)             / write pixel to frame buffer
 380 //
 381 // The above is done four times, once for each pixel. All of the
 382 // calculations are interleaved in order to avoid AGI stalls and
 383 // missed pairing opportunities.
 384
 385                 DoFPCalcs
 386                 fdivr   %st, %st(3)
 387                 movl    (ubyz4), %ebx
 388                 movl    (vbyz4), %edx
 389                 incl    %ebx
 390                 incl    %edx
 391                 shrl    $4, %ebx
 392                 andl    $0x3FF0, %edx
 393                 shll    $10, %edx
 394                 andl    $0x03FF, %ebx
 395                 movl    (uvzero), %ecx          // %ecx = uv value for pixel 0
 396                 orl     %edx, %ebx
 397                 movl    %ecx, %eax
 398                 movl    %ebx, (uvzero)
 399                 andl    $0x003F00FC, %ecx       // 0 a
 400                 orl     $0x1000, %ebx
 401                 movb    %cl, %ch                // 0 b
 402                 subl    %eax, %ebx
 403                 shrl    $10, %ecx               // 0 c
 404                 movl    $0x7F0000, %edx
 405                 shrl    $2, %ebx
 406                 andl    %ebp, %edx
 407                 sarl    $8, %edx
 408                 movb    (%esi,%ecx), %dl        // 0 d
 409                 addl    $4, %edi
 410                 lea     (%eax,%ebx,2), %ecx     // %ecx = uv value for pixel 2
 411                 addl    %ebx, %eax              // %eax = uv value for pixel 1
 412                 addl    %ecx, %ebx              // %ebx = uv value for pixel 3
 413                 andl    $0x003F00FC, %ecx       // 2 a
 414                 movb    %cl, %ch                // 2 b
 415                 movb    fadetbl(%edx), %dl      // 0 e
 416                 shrl    $10, %ecx               // 2 c
 417                 andl    $0x003F00FC, %eax       // 1 a
 418                 movb    %dl, -4(%edi)           // 0 f
 419                 movb    %al, %ah                // 1 b
 420                 movb    (%esi,%ecx), %dl        // 2 d
 421                 andl    $0x003F00FC, %ebx       // 3 a
 422                 shrl    $10, %eax               // 1 c
 423                 movb    %bl, %bh                // 3 b
 424                 movb    fadetbl(%edx), %cl      // 2 e
 425                 movb    (%esi,%eax), %dl        // 1 d
 426                 shrl    $10, %ebx               // 3 c
 427                 movb    %cl, -2(%edi)           // 2 f
 428                 movl    (dldx), %ecx
 429                 movb    fadetbl(%edx), %al      // 1 e
 430                 movb    (%esi,%ebx), %dl        // 3 d
 431                 movb    %al, -3(%edi)           // 1 f
 432                 addl    %ecx, %ebp
 433                 movb    fadetbl(%edx), %bl      // 3 e
 434                 movl    (lastquartet), %ecx
 435                 movb    %bl, -1(%edi)           // 3 f
 436                 cmpl    %ecx, %edi
 437                 jb      LoopTransOff            // addresses: compare unsigned
 438
 439 // Are there any pixels left at all?
 440
 441                 cmpl    (lastpixel), %edi
 442                 jnz     LastBits
 443                 jmp     LeaveNow
 444
 445
 446 .balign 4
 447
 448 LoopTransOn:
 449
 450 // This is similar to the LoopTransOff loop, the big change being that
 451 // each value retrieved from the texture map is tested against 255,
 452 // the transparent "color". A value of 255 in the texture map means to
 453 // let the existing value for that pixel in write_buffer go by
 454 // unchanged. Thus the code for each pixel looks something like this
 455 // instead:
 456 //
 457 //      a:      andl    $0x003F00FC, %ebx       / mask off extraneous bits
 458 //      b:      movb    %bl, %bh                / make u flush with v
 459 //      c:      shrl    $10, %ebx               / right-justify u and v
 460 //      d:      movb    (%esi,%ebx), %dl        / get texture-map pixel
 461 //      e:      cmpb    $255, %dl               / is pixel transparent?
 462 //      f:      sbbb    %bh, %bh                / yes, %bh=00; no, %bh=FF
 463 //      g:      movb    fadetbl(%edx), %dl      / get lighting-corrected pixel
 464 //      h:      movb    (%edi), %bl             / get pixel in frame buffer now
 465 //      i:      xorb    %bl, %dl                / combine the two
 466 //      j:      andb    %dl, %bh                / use %bh as a mask to select
 467 //      k:      xorb    %bl, %bh                /     which pixel to keep
 468 //      l:      movb    %bh, (%edi)             / write pixel to frame buffer
 469 //
 470 // When the texture-map value is 255, the code simply writes the
 471 // original frame-buffer value back out again; otherwise the new pixel
 472 // is written instead. The ands and xors used to accomplish this bulk
 473 // up the code, but on the whole it is better than having four
 474 // unpredictable jumps in the loop. The four repeats of the above code
 475 // are even more intertwined than the other loop, due to the extra
 476 // register usage. Also note that the last two pixels combine steps i,
 477 // j, and k with each other.
 478
 479                 DoFPCalcs
 480                 fdivr   %st, %st(3)
 481                 movl    (ubyz4), %ebx
 482                 movl    (vbyz4), %edx
 483                 incl    %ebx
 484                 incl    %edx
 485                 movl    (uvzero), %ecx          // %ecx = uv for pixel 0
 486                 andl    $0x3FF0, %ebx
 487                 shrl    $4, %ebx
 488                 andl    $0x3FF0, %edx
 489                 shll    $10, %edx
 490                 movl    %ecx, %eax
 491                 andl    $0x003F00FC, %ecx       // 0 a
 492                 orl     %edx, %ebx
 493                 movb    %cl, %ch                // 0 b
 494                 addl    $4, %edi
 495                 shrl    $10, %ecx               // 0 c
 496                 movl    $0x7F0000, %edx
 497                 movl    %ebx, (uvzero)
 498                 andl    %ebp, %edx
 499                 sarl    $8, %edx
 500                 movb    (%esi,%ecx), %dl        // 0 d
 501                 orl     $0x1000, %ebx
 502                 subl    %eax, %ebx
 503                 movb    -4(%edi), %ch           // 0 h
 504                 movb    fadetbl(%edx), %cl      // 0 g
 505                 cmpb    $255, %dl               // 0 e
 506                 sbbb    %dl, %dl                // 0 f
 507                 xorb    %ch, %cl                // 0 i
 508                 shrl    $2, %ebx
 509                 andb    %cl, %dl                // 0 j
 510                 xorb    %ch, %dl                // 0 k
 511 /               nop                             // (V-pipe idle)
 512                 lea     (%eax,%ebx,2), %ecx     // %ecx = uv for pixel 2
 513                 addl    %ebx, %eax              // %eax = uv for pixel 1
 514                 andl    $0x003F00FC, %eax       // 1 a
 515                 addl    %ecx, %ebx              // %ebx = uv for pixel 3
 516                 movb    %al, %ah                // 1 b
 517                 andl    $0x003F00FC, %ecx       // 2 a
 518                 shrl    $10, %eax               // 1 c
 519                 andl    $0x003F00FC, %ebx       // 3 a
 520                 movb    %cl, %ch                // 2 b
 521                 movb    %bl, %bh                // 3 b
 522                 movb    %dl, -4(%edi)           // 0 l
 523                 movb    (%esi,%eax), %dl        // 1 d
 524                 movb    -3(%edi), %al           // 1 h
 525                 cmpb    $255, %dl               // 1 e
 526                 sbbb    %ah, %ah                // 1 f
 527                 movb    fadetbl(%edx), %dl      // 1 g
 528                 shrl    $10, %ecx               // 2 c
 529                 xorb    %al, %dl                // 1 i
 530                 shrl    $10, %ebx               // 3 c
 531                 andb    %dl, %ah                // 1 j
 532                 xorb    %al, %ah                // 1 k
 533                 movb    (%esi,%ecx), %dl        // 2 d
 534                 movb    %ah, -3(%edi)           // 1 l
 535                 cmpb    $255, %dl               // 2 e
 536                 sbbb    %ah, %ah                // 2 f
 537                 movb    fadetbl(%edx), %ch      // 2 g
 538                 movb    (%esi,%ebx), %dl        // 3 d
 539                 movb    -2(%edi), %bh           // 2 h
 540                 cmpb    $255, %dl               // 3 e
 541                 movb    -1(%edi), %bl           // 3 h
 542                 sbbb    %al, %al                // 3 f
 543                 movb    fadetbl(%edx), %cl      // 3 g
 544                 movl    (dldx), %edx
 545                 xorl    %ebx, %ecx              // 2 i and 3 i
 546                 addl    %edx, %ebp
 547                 andl    %ecx, %eax              // 2 j and 3 j
 548                 movl    (lastquartet), %ecx
 549                 xorl    %ebx, %eax              // 2 k and 3 k
 550                 movb    %ah, -2(%edi)           // 2 l
 551                 cmpl    %ecx, %edi
 552                 movb    %al, -1(%edi)           // 3 l (mov leaves the flags alone)
 553                 jb      LoopTransOn             // addresses: compare unsigned
 554
 555 // Quit if there are none at all left.
 556
 557                 cmpl    (lastpixel), %edi
 558                 jz      LeaveNow
 559
 560
 561 LastBits:
 562
 563 // Here we finish off the last one-to-three pixels assigned to us.
 564 // Rather than calculating values for all four pixels, we just divide
 565 // the difference by four and keep adding this average into the value
 566 // as needed. (This code is not particularly optimized, by the way,
 567 // since it represents such a minuscule amount of the running time.)
     // %ebp gives up l here and holds lastpixel instead; the lighting
     // offset in %edx is computed once and shared by all remaining pixels.
 568
 569                 DoFPCalcs
 570                 movl    (ubyz4), %ecx
 571                 movl    (vbyz4), %edx
 572                 incl    %ecx
 573                 incl    %edx
 574                 shrl    $4, %ecx
 575                 andl    $0x3FF0, %edx
 576                 shll    $10, %edx
 577                 andl    $0x03FF, %ecx
 578                 movl    (uvzero), %ebx
 579                 orl     %edx, %ecx
 580                 orl     $0x1000, %ecx
 581                 subl    %ebx, %ecx
 582                 shrl    $2, %ecx
 583                 andl    $0x003FC0FF, %ecx
 584                 movl    %ebp, %edx
 585                 movl    (lastpixel), %ebp
 586                 andl    $0x7F0000, %edx
 587                 sarl    $8, %edx
 588
 589 LoopLastBits:   movl    %ebx, %eax
 590                 movb    %al, %ah
 591                 shrl    $10, %eax
 592                 andb    $0x0F, %ah
 593                 movb    (%esi,%eax), %dl
 594                 cmpb    $255, %dl
 595                 jz      LetPixelBy
 596                 movb    fadetbl(%edx), %al
 597                 movb    %al, (%edi)
 598 LetPixelBy:     addl    %ecx, %ebx
 599                 incl    %edi
 600                 cmpl    %ebp, %edi
 601                 jb      LoopLastBits            // addresses: compare unsigned
 602
 603
 604 LeaveNow:
 605
 606 // We're done! Restore the saved registers (in reverse order of the
 607 // pushes), clear the FP stack, reset the FPU control word, and we
 608 // are so out of here.
                     popl    %ebx
 609                 popl    %esi
 610                 popl    %edi
 611                 popl    %ebp
 612                 fcompp                          // each fcompp pops two FP registers
 613                 fcompp
 614                 fldcw   (ctlwd)
 615                 ret