texmap/tmappent.S

   1 /// tmap_scanline_per - Pentium-optimized assembly version
   2 /// written by Brian Raiter, Mar 1998.
   3 /// lighting roundoff error fixed by Matt Mueller, July 1999
   4
   5
   6 /// The gist of the algorithm is as follows (note that this is
   7 /// pseudocode, not actual C):
   8 ///
   9 /// int  u = fx_u;
  10 /// int  v = fx_v;
  11 /// int  z = fx_z;
  12 /// int  l = fx_l;
  13 /// int  x, ubyz, vbyz;
  14 /// byte texmap[64][64] = pixptr;
  15 /// byte framebuffer[][bytes_per_row] = write_buffer;
  16 /// byte lightingtable[][256] = gr_fade_table;
  17 /// byte c;
  18 ///
  19 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
  20 ///     ubyz = (u / z) & 63;
  21 ///     vbyz = (v / z) & 63;
  22 ///     c = texmap[ubyz][vbyz];
  23 ///     if (c != TRANSPARENT_COLOR)
  24 ///         framebuffer[fx_y][x] = lightingtable[l / 65536][c];
  25 ///     u += fx_du_dx;
  26 ///     v += fx_dv_dx;
  27 ///     z += fx_dz_dx;
  28 ///     l += fx_dl_dx;
  29 /// }
  30 ///
  31 /// The global variable Transparency_on is zero when it is known that
  32 /// there are no transparencies involved, so in that case we use a
  33 /// different loop that skips the transparency test.
  34 ///
  35 /// The actual algorithm used here only does the division calculations
  36 /// every fourth pixel, and linearly interpolates the other three.
  37 /// Something along the lines of:
  38 ///
  39 /// /* Initial values as before */
  40 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
  41 ///
  42 /// ubyz0 = u / z;
  43 /// vbyz0 = v / z;
  44 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
  45 ///     u += fx_du_dx * 4;
  46 ///     v += fx_dv_dx * 4;
  47 ///     z += fx_dz_dx * 4;
  48 ///     ubyz4 = u / z;
  49 ///     vbyz4 = v / z;
  50 ///     du1 = (ubyz4 - ubyz0) / 4;
  51 ///     dv1 = (vbyz4 - vbyz0) / 4;
  52 ///     ubyz = ubyz0;
  53 ///     vbyz = vbyz0;
  54 ///     for (i = 0 ; i < 4 ; ++i) {
  55 ///         c = texmap[ubyz & 63][vbyz & 63];
  56 ///         if (c != TRANSPARENT_COLOR)
  57 ///             framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
  58 ///         ubyz += du1;
  59 ///         vbyz += dv1;
  60 ///         l += fx_dl_dx;
  61 ///     }
  62 ///     ubyz0 = ubyz4;
  63 ///     vbyz0 = vbyz4;
  64 /// }
  65 /// for ( ; x <= fx_xright ; ++x) {
  66 ///     /* Finish off remaining 0-3 pixels */
  67 /// }
  68 ///
  69 /// So much for the basic overview.
  70 ///
  71 /// In this version, the Pentium's floating-point unit is pressed into
  72 /// service to do the actual divisions, so that 1/z can be calculated
  73 /// first, and the resulting reciprocal multiplied with u and v. These
  74 /// two products are then stored back out as integers. This keeps us
  75 /// down to doing only one division every four pixels, during which
  76 /// other integer instructions can be overlapped.
  77 ///
  78 /// The algorithm actually divides 64 by z, so that the rounded-off
  79 /// products will effectively be stored with six fraction bits. This
  80 /// allows the algorithm to correct for minor floating-point roundoff
  81 /// errors. Two fraction bits are kept during the interpolation of the
  82 /// three middle pixels, which hopefully increases the accuracy of the
  83 /// approximations.
  84 ///
  85 /// We only need the lowest six (integral) bits of u/z and v/z for
  86 /// each pixptr offset, so we only need eight bits of each fourth pair
  87 /// of values to figure the interpolation. Along with the two fractional
  88 /// bits we keep for extra precision flavor, this makes ten bits for
  89 /// each value, or twenty to store the full pair. To simplify the
  90 /// interpolation, the pair is packed into a single 32-bit register
  91 /// like so:
  92 ///
  93 ///             3      2       1
  94 ///             1      4       6       8       0
  95 ///             ________vvVVVVVVvv____uuUUUUUUuu
  96 ///                       \v&63/        \u&63/
  97 ///
  98 /// The unused bits between the u and v values permit the packed
  99 /// values to be added/subtracted without the u values spilling over
 100 /// into the v values. Then, after anding out the carry/borrow bits,
 101 /// the instructions "movb %al, %ah ; shrl $10, %eax" nicely
 102 /// right-justify the desired values into a pixptr offset.
 103 ///
 104 /// The FP stack is loaded up with the values of u, v, and z,
 105 /// converted to floats. %ebp is used to hold the value of l, %esi is
 106 /// set to pixptr, and %edi points to our current position in
 107 /// write_buffer.
 108
 109
 110
 111 // Short local alias for the external lighting (fade) table symbol.
 112
 113 .set    fadetbl, _gr_fade_table
 114
 115
 116 // The following macro encapsulates the floating-point instructions
 117 // that put the results of a prior division to use and prepare for the
 118 // next division. At the beginning of the macro, the FP stack contains
 119 // (from top to bottom): z, u, v, 64/z. The macro computes (64*u)/z,
 120 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
 121 // The number (2^51 + 2^52) is added to each number before they are
 122 // stored as qwords. Since qwords only have 52 bits of precision, this
 123 // magic number causes the fractional part to be shifted off the end,
 124 // leaving the integral part right-shifted. Thus, reading the low
 125 // dword gives the original number rounded off to the nearest integer
 126 // - in two's complement, no less. (This technique allows for more
 127 // pipelining than using the more straightforward fist/p
 128 // instruction.) Simultaneous with this, the macro adds dudx to u,
 129 // dvdx to v, and dzdx to z, and finally puts 64 back onto the stack.
 130 // At the end of the macro, the stack contains: z, u, v, 64.
 131 // Side effects: writes ubyz4 and vbyz4; briefly uses st(0)-st(5); touches no integer registers.
 132 .macro DoFPCalcs                // The FP stack after each instruction:
 133                                 //                 z    u    v  64/z
 134         fadds   (dzdx)          // z += dzdx      z'   u    v  64/z
 135         fxch    %st(1)          //                u    z'   v  64/z
 136         fst     %st(4)          // copy u down    u    z'   v  64/z   u
 137         fmul    %st(3)          // (64 / z) * u  u/z   z'   v  64/z   u
 138         fxch    %st(4)          //                u    z'   v  64/z  u/z
 139         fadds   (dudx)          // u += dudx      u'   z'   v  64/z  u/z
 140         fxch    %st(2)          //                v    z'   u' 64/z  u/z
 141         fmul    %st, %st(3)     // (64 / z) * v   v    z'   u'  v/z  u/z
 142         fxch    %st(4)          //               u/z   z'   u'  v/z   v
 143         fadds   (magic)         // bias u/z      U/Z   z'   u'  v/z   v
 144         fxch    %st(4)          //                v    z'   u'  v/z  U/Z
 145         fadds   (dvdx)          // v += dvdx      v'   z'   u'  v/z  U/Z
 146         fxch    %st(3)          //               v/z   z'   u'   v'  U/Z
 147         fadds   (magic)         // bias v/z      V/Z   z'   u'   v'  U/Z
 148         flds    (flt64)         //                64  V/Z   z'   u'   v'  U/Z
 149         fxch    %st(5)          //               U/Z  V/Z   z'   u'   v'   64
 150         fstpl   (ubyz4)         // store + pop   V/Z   z'   u'   v'   64
 151         fstpl   (vbyz4)         // store + pop    z'   u'   v'   64
 152                                 // (ready to start the next division)
 153 .endm
 154 // Under __ENV_LINUX__, alias the underscore-prefixed names used in this file to the unprefixed symbols.
 155 #ifdef __ENV_LINUX__
 156 .set _pixptr, pixptr
 157 .set _gr_fade_table, gr_fade_table
 158 .set _write_buffer, write_buffer
 159 .set _bytes_per_row, bytes_per_row
 160 .set _fx_xleft, fx_xleft
 161 .set _fx_xright, fx_xright
 162 .set _fx_y, fx_y
 163 .set _fx_u, fx_u
 164 .set _fx_v, fx_v
 165 .set _fx_z, fx_z
 166 .set _fx_l, fx_l
 167 .set _fx_du_dx, fx_du_dx
 168 .set _fx_dv_dx, fx_dv_dx
 169 .set _fx_dz_dx, fx_dz_dx
 170 .set _fx_dl_dx, fx_dl_dx
 171 .set _Transparency_on, Transparency_on
 172
 173 .global asm_tmap_scanline_per
 174 #else
 175 .global _asm_tmap_scanline_per
 176 #endif
 177
 178 .extern _pixptr, _gr_fade_table, _write_buffer
 179 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
 180 .extern _fx_u, _fx_v, _fx_z, _fx_l
 181 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
 182 .extern _Transparency_on
 183
 184
 185
 186
 187 //.local  dudx, dvdx, dzdx, dldx
 188 //.local  ubyz4, vbyz4, uvzero
 189 //.local  lastquartet, lastpixel, ctlwd
 190 //.local  flt64, magic
 191
 192
 193 .data
 194
 195 .p2align 3
 196
 197 lastquartet:    .int    0               // end pointer of the 4-pixel loop
 198 lastpixel:      .int    0               // end pointer of the whole scanline
 199 flt64:          .int    0x42800000      // float 64.0 (dividend for 64/z)
 200 magic:          .int    0x59C00000      // float 2^51 + 2^52 (float-to-int bias)
 201 ubyz4:          .quad   0               // biased 64*u/z, stored as a double
 202 vbyz4:          .quad   0               // biased 64*v/z, stored as a double
 203 dudx:           .int    0               // 4 * fx_du_dx, held as a float
 204 dvdx:           .int    0               // 4 * fx_dv_dx, held as a float
 205 dzdx:           .int    0               // 4 * fx_dz_dx, held as a float
 206 dldx:           .int    0               // 4 * fx_dl_dx, held as an integer
 207 uvzero:         .int    0               // packed u/z and v/z of pixel 0
 208 ctlwd:          .short  0               // scratch for the FPU control word
 209
 210
 211 .text
 212
 213 .balign 4
 214
 215 // void asm_tmap_scanline_per(void)   (32-bit x86, AT&T syntax)
 216 // Draws one scanline from the fx_* globals. Preserves %ebx, %ebp, %esi
 217 // and %edi; uses %eax, %ecx, %edx and six FPU stack slots as scratch.
 218
 219 #ifdef __ENV_LINUX__
 220 asm_tmap_scanline_per:
 221 #else
 222 _asm_tmap_scanline_per:
 223 #endif
 224
 225 // Save the callee-saved registers this routine uses (%ebx included).
 226
 227                 pushl   %ebp
 228                 pushl   %edi
 229                 pushl   %esi
 230                 pushl   %ebx
 231 // Set the FPU precision-control bits to 10b, i.e. double precision
 232 // (still plenty precise enough for our needs) so as to speed up fdiv.
 233
 234                 fnstcw  (ctlwd)
 235                 movw    (ctlwd), %ax
 236                 movl    %eax, %ebx
 237                 andb    $0xFC, %bh
 238                 orb     $0x02, %bh
 239                 movw    %bx, (ctlwd)
 240                 fldcw   (ctlwd)
 241                 movw    %ax, (ctlwd)            // keep the original word for LeaveNow
 242
 243 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
 244 // into floating point.
 245
 246                 movl    (_fx_du_dx), %ebx
 247                 movl    (_fx_dv_dx), %ecx
 248                 sall    $2, %ebx
 249                 movl    (_fx_dz_dx), %edx
 250                 sall    $2, %ecx
 251                 movl    %ebx, (dudx)
 252                 sall    $2, %edx
 253                 movl    %ecx, (dvdx)
 254                 movl    %edx, (dzdx)
 255                 fildl   (dudx)
 256                 fildl   (dvdx)
 257                 fildl   (dzdx)
 258                 fxch    %st(2)
 259                 fstps   (dudx)
 260                 fstps   (dvdx)
 261                 fstps   (dzdx)
 262
 263 // bytes_per_row * fx_y is the offset for the current scanline. (We do
 264 // this now before we start the first FP division.)
 265
 266                 movl    (_bytes_per_row), %eax
 267                 xorl    %edx, %edx
 268                 mull    (_fx_y)
 269
 270 // Push 64.0, v, u, and z onto the FPU stack, and then start
 271 // calculating the first 64 / z.
 272
 273                 flds    (flt64)
 274                 fildl   (_fx_v)
 275                 fildl   (_fx_u)
 276                 fildl   (_fx_z)
 277                 fdivr   %st, %st(3)
 278
 279 // Meanwhile, load l into %ebp, where it stays for the duration, and
 280 // store dldx (again multiplied by four) in memory. These used to be
 281 // pre-divided by 256 so that the fade-table row landed in %dh; see
 282 // the note below for why that is now done per quartet instead.
 283
 284 //Dividing by 256 is bad.. rounding errors and crap.  We'll now do that
 285 //right before we need to access the table instead.  -MM
 286
 287                 movl    (_fx_l), %ebp
 288 //              sarl    $8, %ebp
 289                 movl    (_fx_dl_dx), %edx
 290 //              sarl    $6, %edx
 291                 sall    $2, %edx
 292                 movl    %edx, (dldx)
 293
 294 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
 295 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
 296 // %edi so that it points to pixel (fx_y)(fx_xleft). Calculate a
 297 // pointer to (fx_y)[fx_xright + 1] so we know when to stop drawing.
 298 // Also calculate where the last whole group of four pixels (counted
 299 // from fx_xleft) ends, so we know when to stop the 4-pixel loop.
 300
 301                 movl    (_pixptr), %esi
 302                 movl    (_write_buffer), %edi
 303                 movl    (_fx_xright), %ecx
 304                 addl    %eax, %edi
 305                 incl    %ecx
 306                 addl    %edi, %ecx
 307                 addl    (_fx_xleft), %edi
 308                 movl    %ecx, %eax
 309                 subl    %edi, %eax              // %eax = signed pixel count
 310                 jle     LeaveNow
 311                 andl    $3, %eax
 312                 movl    %ecx, (lastpixel)
 313                 subl    %eax, %ecx
 314                 movl    %ecx, (lastquartet)
 315
 316 // Calculate round(64 * u / z) and round(64 * v / z), store, and
 317 // increment u, v, and z. Then start calculating the second 64 / z.
 318
 319                 DoFPCalcs
 320                 fdivr   %st, %st(3)
 321
 322 // Get our u/z and v/z values, lop off the bits we don't care about,
 323 // pack, and store in uvzero.
 324
 325                 movl    (ubyz4), %eax
 326                 movl    (vbyz4), %ebx
 327                 incl    %eax
 328                 incl    %ebx
 329                 andl    $0x3FF0, %eax
 330                 andl    $0x3FF0, %ebx
 331                 shrl    $4, %eax
 332                 shll    $10, %ebx
 333                 orl     %eax, %ebx
 334                 movl    %ebx, (uvzero)
 335
 336 // While we're waiting for the last division to finish, we might as
 337 // well get the frame buffer into the cache.
 338
 339                 cmpb    (%edi), %al
 340
 341 // Are there at least four pixels to draw? If not, skip to the epilog
 342 // code.
 343
 344                 cmpl    %ecx, %edi
 345                 je      LastBits
 346
 347 // Do we need to test for transparencies?
 348
 349                 testl   $(~0), (_Transparency_on)
 350                 jnz     LoopTransOn
 351
 352 // If not, then use the simpler loop here.
 353
 354
 355 .balign 4
 356
 357 LoopTransOff:
 358
 359 // While the FPU is busy dividing, the latest u/z and v/z values are
 360 // retrieved, packed, and stored in uvzero (to be used again in the
 361 // next iteration). The old uvzero value, which contains the uv values
 362 // for pixel 0, gets subtracted from the new uvzero value to
 363 // determine the total change in u/z and v/z across the four pixels,
 364 // and this is divided by 4 to get the average. This average is then
 365 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
 366 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
 367 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
 368 // respectively. %edx is set to the current value of l, such that %dh is
 369 // already set as half of the offset into fadetbl. Each uv value is
 370 // used to set its pixel as follows (assuming our packed uv value is
 371 // in %ebx):
 372 //
 373 //      a:      andl    $0x003F00FC, %ebx       / mask off extraneous bits
 374 //      b:      movb    %bl, %bh                / make u flush with v
 375 //      c:      shrl    $10, %ebx               / right-justify u and v
 376 //      d:      movb    (%esi,%ebx), %dl        / get texture-map pixel
 377 //      e:      movb    fadetbl(%edx), %bl      / correct for lighting level
 378 //      f:      movb    %bl, (%edi)             / write pixel to frame buffer
 379 //
 380 // The above is done four times, once for each pixel. All of the
 381 // calculations are interleaved in order to avoid AGI stalls and
 382 // missed pairing opportunities.
 383
 384                 DoFPCalcs
 385                 fdivr   %st, %st(3)
 386                 movl    (ubyz4), %ebx
 387                 movl    (vbyz4), %edx
 388                 incl    %ebx
 389                 incl    %edx
 390                 shrl    $4, %ebx
 391                 andl    $0x3FF0, %edx
 392                 shll    $10, %edx
 393                 andl    $0x03FF, %ebx
 394                 movl    (uvzero), %ecx          // %ecx = uv value for pixel 0
 395                 orl     %edx, %ebx
 396                 movl    %ecx, %eax
 397                 movl    %ebx, (uvzero)
 398                 andl    $0x003F00FC, %ecx       // 0 a
 399                 orl     $0x1000, %ebx
 400                 movb    %cl, %ch                // 0 b
 401                 subl    %eax, %ebx
 402                 shrl    $10, %ecx               // 0 c
 403                 movl    $0x7F0000, %edx
 404                 shrl    $2, %ebx
 405                 andl    %ebp, %edx
 406                 sarl    $8, %edx
 407                 movb    (%esi,%ecx), %dl        // 0 d
 408                 addl    $4, %edi
 409                 lea     (%eax,%ebx,2), %ecx     // %ecx = uv value for pixel 2
 410                 addl    %ebx, %eax              // %eax = uv value for pixel 1
 411                 addl    %ecx, %ebx              // %ebx = uv value for pixel 3
 412                 andl    $0x003F00FC, %ecx       // 2 a
 413                 movb    %cl, %ch                // 2 b
 414                 movb    fadetbl(%edx), %dl      // 0 e
 415                 shrl    $10, %ecx               // 2 c
 416                 andl    $0x003F00FC, %eax       // 1 a
 417                 movb    %dl, -4(%edi)           // 0 f
 418                 movb    %al, %ah                // 1 b
 419                 movb    (%esi,%ecx), %dl        // 2 d
 420                 andl    $0x003F00FC, %ebx       // 3 a
 421                 shrl    $10, %eax               // 1 c
 422                 movb    %bl, %bh                // 3 b
 423                 movb    fadetbl(%edx), %cl      // 2 e
 424                 movb    (%esi,%eax), %dl        // 1 d
 425                 shrl    $10, %ebx               // 3 c
 426                 movb    %cl, -2(%edi)           // 2 f
 427                 movl    (dldx), %ecx
 428                 movb    fadetbl(%edx), %al      // 1 e
 429                 movb    (%esi,%ebx), %dl        // 3 d
 430                 movb    %al, -3(%edi)           // 1 f
 431                 addl    %ecx, %ebp
 432                 movb    fadetbl(%edx), %bl      // 3 e
 433                 movl    (lastquartet), %ecx
 434                 movb    %bl, -1(%edi)           // 3 f
 435                 cmpl    %ecx, %edi
 436                 jb      LoopTransOff            // unsigned: these are pointers
 437
 438 // Are there any pixels left at all?
 439
 440                 cmpl    (lastpixel), %edi
 441                 jnz     LastBits
 442                 jmp     LeaveNow
 443
 444
 445 .balign 4
 446
 447 LoopTransOn:
 448
 449 // This is similar to the LoopTransOff loop, the big change being that
 450 // each value retrieved from the texture map is tested against 255,
 451 // the transparent "color". A value of 255 in the texture map means to
 452 // let the existing value for that pixel in write_buffer go by
 453 // unchanged. Thus the code for each pixel looks something like this
 454 // instead:
 455 //
 456 //      a:      andl    $0x003F00FC, %ebx       / mask off extraneous bits
 457 //      b:      movb    %bl, %bh                / make u flush with v
 458 //      c:      shrl    $10, %ebx               / right-justify u and v
 459 //      d:      movb    (%esi,%ebx), %dl        / get texture-map pixel
 460 //      e:      cmpb    $255, %dl               / is pixel transparent?
 461 //      f:      sbbb    %bh, %bh                / yes, %bh=00; no, %bh=FF
 462 //      g:      movb    fadetbl(%edx), %dl      / get lighting-corrected pixel
 463 //      h:      movb    (%edi), %bl             / get pixel in frame buffer now
 464 //      i:      xorb    %bl, %dl                / combine the two
 465 //      j:      andb    %dl, %bh                / use %bh as a mask to select
 466 //      k:      xorb    %bl, %bh                /     which pixel to keep
 467 //      l:      movb    %bh, (%edi)             / write pixel to frame buffer
 468 //
 469 // When the texture-map value is 255, the code simply writes the
 470 // original frame-buffer value back out again; otherwise the new pixel
 471 // is written instead. The ands and xors used to accomplish this bulk
 472 // up the code, but on the whole it is better than having four
 473 // unpredictable jumps in the loop. The four repeats of the above code
 474 // are even more intertwined than the other loop, due to the extra
 475 // register usage. Also note that the last two pixels combine steps i,
 476 // j, and k with each other.
 477
 478                 DoFPCalcs
 479                 fdivr   %st, %st(3)
 480                 movl    (ubyz4), %ebx
 481                 movl    (vbyz4), %edx
 482                 incl    %ebx
 483                 incl    %edx
 484                 movl    (uvzero), %ecx          // %ecx = uv for pixel 0
 485                 andl    $0x3FF0, %ebx
 486                 shrl    $4, %ebx
 487                 andl    $0x3FF0, %edx
 488                 shll    $10, %edx
 489                 movl    %ecx, %eax
 490                 andl    $0x003F00FC, %ecx       // 0 a
 491                 orl     %edx, %ebx
 492                 movb    %cl, %ch                // 0 b
 493                 addl    $4, %edi
 494                 shrl    $10, %ecx               // 0 c
 495                 movl    $0x7F0000, %edx
 496                 movl    %ebx, (uvzero)
 497                 andl    %ebp, %edx
 498                 sarl    $8, %edx
 499                 movb    (%esi,%ecx), %dl        // 0 d
 500                 orl     $0x1000, %ebx
 501                 subl    %eax, %ebx
 502                 movb    -4(%edi), %ch           // 0 h
 503                 movb    fadetbl(%edx), %cl      // 0 g
 504                 cmpb    $255, %dl               // 0 e
 505                 sbbb    %dl, %dl                // 0 f
 506                 xorb    %ch, %cl                // 0 i
 507                 shrl    $2, %ebx
 508                 andb    %cl, %dl                // 0 j
 509                 xorb    %ch, %dl                // 0 k
 510 //              nop                             // (V-pipe idle)
 511                 lea     (%eax,%ebx,2), %ecx     // %ecx = uv for pixel 2
 512                 addl    %ebx, %eax              // %eax = uv for pixel 1
 513                 andl    $0x003F00FC, %eax       // 1 a
 514                 addl    %ecx, %ebx              // %ebx = uv for pixel 3
 515                 movb    %al, %ah                // 1 b
 516                 andl    $0x003F00FC, %ecx       // 2 a
 517                 shrl    $10, %eax               // 1 c
 518                 andl    $0x003F00FC, %ebx       // 3 a
 519                 movb    %cl, %ch                // 2 b
 520                 movb    %bl, %bh                // 3 b
 521                 movb    %dl, -4(%edi)           // 0 l
 522                 movb    (%esi,%eax), %dl        // 1 d
 523                 movb    -3(%edi), %al           // 1 h
 524                 cmpb    $255, %dl               // 1 e
 525                 sbbb    %ah, %ah                // 1 f
 526                 movb    fadetbl(%edx), %dl      // 1 g
 527                 shrl    $10, %ecx               // 2 c
 528                 xorb    %al, %dl                // 1 i
 529                 shrl    $10, %ebx               // 3 c
 530                 andb    %dl, %ah                // 1 j
 531                 xorb    %al, %ah                // 1 k
 532                 movb    (%esi,%ecx), %dl        // 2 d
 533                 movb    %ah, -3(%edi)           // 1 l
 534                 cmpb    $255, %dl               // 2 e
 535                 sbbb    %ah, %ah                // 2 f
 536                 movb    fadetbl(%edx), %ch      // 2 g
 537                 movb    (%esi,%ebx), %dl        // 3 d
 538                 movb    -2(%edi), %bh           // 2 h
 539                 cmpb    $255, %dl               // 3 e
 540                 movb    -1(%edi), %bl           // 3 h
 541                 sbbb    %al, %al                // 3 f
 542                 movb    fadetbl(%edx), %cl      // 3 g
 543                 movl    (dldx), %edx
 544                 xorl    %ebx, %ecx              // 2 i and 3 i
 545                 addl    %edx, %ebp
 546                 andl    %ecx, %eax              // 2 j and 3 j
 547                 movl    (lastquartet), %ecx
 548                 xorl    %ebx, %eax              // 2 k and 3 k
 549                 movb    %ah, -2(%edi)           // 2 l
 550                 cmpl    %ecx, %edi
 551                 movb    %al, -1(%edi)           // 3 l (movb leaves the flags alone)
 552                 jb      LoopTransOn             // unsigned: these are pointers
 553
 554 // Quit if there are none at all left.
 555
 556                 cmpl    (lastpixel), %edi
 557                 jz      LeaveNow
 558
 559
 560 LastBits:
 561
 562 // Here we finish off the last one-to-three pixels assigned to us.
 563 // Rather than calculating values for all four pixels, we just divide
 564 // the difference by four and keep adding this average into the value
 565 // as needed. (This code is not particularly optimized, by the way,
 566 // since it represents such a minuscule amount of the running time.)
 567 // Note: 255 is always treated as transparent here, and l is not advanced.
 568                 DoFPCalcs
 569                 movl    (ubyz4), %ecx
 570                 movl    (vbyz4), %edx
 571                 incl    %ecx
 572                 incl    %edx
 573                 shrl    $4, %ecx
 574                 andl    $0x3FF0, %edx
 575                 shll    $10, %edx
 576                 andl    $0x03FF, %ecx
 577                 movl    (uvzero), %ebx
 578                 orl     %edx, %ecx
 579                 orl     $0x1000, %ecx
 580                 subl    %ebx, %ecx
 581                 shrl    $2, %ecx
 582                 andl    $0x003FC0FF, %ecx
 583                 movl    %ebp, %edx
 584                 movl    (lastpixel), %ebp       // %ebp now marks the end pointer
 585                 andl    $0x7F0000, %edx
 586                 sarl    $8, %edx
 587
 588 LoopLastBits:   movl    %ebx, %eax
 589                 movb    %al, %ah
 590                 shrl    $10, %eax
 591                 andb    $0x0F, %ah
 592                 movb    (%esi,%eax), %dl
 593                 cmpb    $255, %dl
 594                 jz      LetPixelBy
 595                 movb    fadetbl(%edx), %al
 596                 movb    %al, (%edi)
 597 LetPixelBy:     addl    %ecx, %ebx
 598                 incl    %edi
 599                 cmpl    %ebp, %edi
 600                 jb      LoopLastBits            // unsigned: these are pointers
 601
 602
 603 LeaveNow:
 604
 605 // We're done! Restore the saved registers (reverse of the prologue),
 606 // pop our four FPU values, and reload the caller's FPU control word.
 607                 popl    %ebx
 608                 popl    %esi
 609                 popl    %edi
 610                 popl    %ebp
 611                 fcompp
 612                 fcompp
 613                 fldcw   (ctlwd)
 614                 ret