texmap/tmapppro.S

   1 /// $Id: tmapppro.S,v 1.5 2003-02-18 20:15:48 btb Exp $
   2 /// tmap_scanline_per - Pentium-Pro-optimized assembly version
   3 /// written by Brian Raiter, Mar 1998.
   4 /// lighting roundoff error fixed by Matt Mueller, July 1999
   5
   6 /// The gist of the algorithm is as follows (note that this is
   7 /// pseudocode, not actual C):
   8 ///
   9 /// int  u = fx_u;
  10 /// int  v = fx_v;
  11 /// int  z = fx_z;
  12 /// int  l = fx_l;
  13 /// int  x, ubyz, vbyz;
  14 /// byte texmap[64][64] = pixptr;
  15 /// byte framebuffer[][bytes_per_row] = write_buffer;
  16 /// byte lightingtable[][256] = gr_fade_table;
  17 /// byte c;
  18 ///
  19 /// for (x = fx_xleft ; x <= fx_xright ; ++x) {
  20 ///     ubyz = (u / z) & 63;
  21 ///     vbyz = (v / z) & 63;
  22 ///     c = texmap[ubyz][vbyz];
  23 ///     if (c != TRANSPARENT_COLOR)
  24 ///         framebuffer[fx_y][x] = lightingtable[l / 65536][c];
  25 ///     u += fx_du_dx;
  26 ///     v += fx_dv_dx;
  27 ///     z += fx_dz_dx;
  28 ///     l += fx_dl_dx;
  29 /// }
  30 ///
  31 /// The global variable Transparency_on is zero when it is known that
  32 /// there are no transparencies involved, so in that case we use a
  33 /// different loop that skips the transparency test.
  34 ///
  35 /// The actual algorithm used here only does the division calculations
  36 /// every fourth pixel, and linearly interpolates the other three.
  37 /// Something along the lines of:
  38 ///
  39 /// /* Initial values as before */
  40 /// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
  41 ///
  42 /// ubyz0 = u / z;
  43 /// vbyz0 = v / z;
  44 /// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
  45 ///     u += fx_du_dx * 4;
  46 ///     v += fx_dv_dx * 4;
  47 ///     z += fx_dz_dx * 4;
  48 ///     ubyz4 = u / z;
  49 ///     vbyz4 = v / z;
  50 ///     du1 = (ubyz4 - ubyz0) / 4;
  51 ///     dv1 = (vbyz4 - vbyz0) / 4;
  52 ///     ubyz = ubyz0;
  53 ///     vbyz = vbyz0;
  54 ///     for (i = 0 ; i < 4 ; ++i) {
  55 ///         c = texmap[ubyz & 63][vbyz & 63];
  56 ///         if (c != TRANSPARENT_COLOR)
  57 ///             framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
  58 ///         ubyz += du1;
  59 ///         vbyz += dv1;
  60 ///         l += fx_dl_dx;
  61 ///     }
  62 ///     ubyz0 = ubyz4;
  63 ///     vbyz0 = vbyz4;
  64 /// }
  65 /// for ( ; x <= fx_xright ; ++x) {
  66 ///     /* Finish off remaining 0-3 pixels */
  67 /// }
  68 ///
  69 /// So much for the basic overview.
  70 ///
  71 /// In this version, the PPro's floating-point unit is pressed into
  72 /// service to do the actual divisions, so that 1/z can be calculated
  73 /// first, and the resulting reciprocal multiplied with u and v. These
  74 /// two products are then stored back out as integers. This keeps us
  75 /// down to doing only one division every four pixels, during which
  76 /// other integer instructions can be overlapped.
  77 ///
  78 /// The algorithm actually divides 64 by z, so that the rounded-off
  79 /// products will effectively be stored with six fraction bits. This
  80 /// allows the algorithm to correct for minor floating-point roundoff
  81 /// errors. Two fraction bits are kept during the interpolation of the
  82 /// three middle pixels, which hopefully increases the accuracy of the
  83 /// approximations.
  84 ///
  85 /// We only need the lowest six (integral) bits of u/z and v/z for
  86 /// each pixptr offset, so we only need eight bits of each fourth
  87 /// pair of values to figure the interpolation. Along with the two
  88 /// fractional bits we keep for extra precision flavor, this makes ten
  89 /// bits for each value, or twenty to store the full pair. To simplify
  90 /// the interpolation, the pair is packed into a single 32-bit
  91 /// register like so:
  92 ///
  93 ///             3      2       1
  94 ///             1      4       6       8       0
  95 ///             vvVVVVVVvv____________uuUUUUUUuu
  96 ///               \v&63/                \u&63/
  97 ///
  98 /// The unused bits between the u and v values permit the packed
  99 /// values to be added/subtracted without the u values spilling over
 100 /// into the v values. Then, the instructions "bswap %eax ; roll $6,
 101 /// %eax ; andl $0x0FFF, %eax" will right-justify the desired values
 102 /// into a pixptr offset.
 103 ///
 104 /// The FP stack is loaded up with the values of u, v, and z,
 105 /// converted to floats. %ebp is used to hold the fade-table row offset
 106 /// derived from l, %esi is set to pixptr, and %edi points to our
 107 /// current position in write_buffer.
 108
 109
 110
 111 // Short local alias for the externally defined fade (lighting) table.
 112
 113 .set    fadetbl, _gr_fade_table
 114
 115
 116 // The following macro encapsulates the floating-point instructions
 117 // that put the results of a prior division to use and prepare for the
 118 // next division. At the beginning of the macro, the FP stack contains
 119 // (from top to bottom): 64/z, z, u, v. The macro computes (64*u)/z,
 120 // which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
 121 // Simultaneous with this, the macro adds dudx to u, dvdx to v, and
 122 // dzdx to z, and finally puts 64 back onto the stack. At the end of
 123 // the macro, the stack contains: 64, z, u, v.
     //
     // Reads:   dudx, dvdx, dzdx, flt64 (as single-precision floats)
     // Writes:  ubyz4, vbyz4 (as 32-bit integers, via fistpl)
     //
     // The FP stack is six entries deep at its peak (right after the flds).
     // The macro does not start the next division itself; a caller that
     // wants one follows the macro with "fdiv %st(1)". In the stack
     // diagrams below, "u/z" and "v/z" are shorthand for the products with
     // 64/z, i.e. (64*u)/z and (64*v)/z.
 124
 125 .macro DoFPCalcs 0              // The FP stack after each instruction:
 126                                 //               64/z  z    u    v
 127         fst     %st(4)          //               64/z  z    u    v  64/z
 128         fxch    %st(2)          //                 u   z  64/z   v  64/z
 129         fmul    %st, %st(4)     // (64 * u) / z    u   z  64/z   v   u/z
 130         fadds   (dudx)          // u += dudx       u'  z  64/z   v   u/z
 131         fxch    %st(3)          //                 v   z  64/z   u'  u/z
 132         fmul    %st, %st(2)     // (64 * v) / z     v   z   v/z   u'  u/z
 133         fadds   (dvdx)          // v += dvdx       v'  z   v/z   u'  u/z
 134         fxch    %st(1)          //                 z   v'  v/z   u'  u/z
 135         fadds   (dzdx)          // z += dzdx       z'  v'  v/z   u'  u/z
 136         fxch    %st(2)          //                v/z  v'   z'   u'  u/z
 137         flds    (flt64)         //                 64 v/z   v'   z'   u'   u/z
 138         fxch    %st(5)          //                u/z v/z   v'   z'   u'    64
 139         fistpl  (ubyz4)         // store (64*u)/z  v/z  v'   z'   u'   64
 140         fistpl  (vbyz4)         // store (64*v)/z   v'  z'   u'   64
 141         fxch    %st(3)          //                 64  z'   u'   v'
 142                                 // (ready to start the next division)
 143 .endm
 144
 145
     // The code in this file refers to the C globals by their
     // underscore-prefixed names. When __linux__ is defined, each of those
     // names is mapped onto the corresponding unprefixed symbol, and the
     // entry point is exported without the leading underscore as well.
 146 #ifdef __linux__
 147 .set _pixptr, pixptr
 148 .set _gr_fade_table, gr_fade_table
 149 .set _write_buffer, write_buffer
 150 .set _bytes_per_row, bytes_per_row
 151 .set _fx_xleft, fx_xleft
 152 .set _fx_xright, fx_xright
 153 .set _fx_y, fx_y
 154 .set _fx_u, fx_u
 155 .set _fx_v, fx_v
 156 .set _fx_z, fx_z
 157 .set _fx_l, fx_l
 158 .set _fx_du_dx, fx_du_dx
 159 .set _fx_dv_dx, fx_dv_dx
 160 .set _fx_dz_dx, fx_dz_dx
 161 .set _fx_dl_dx, fx_dl_dx
 162 .set _Transparency_on, Transparency_on
 163
 164 .global asm_ppro_tmap_scanline_per
 165 #else
 166 .global _asm_ppro_tmap_scanline_per
 167 #endif
 168
 169 .extern _pixptr, _gr_fade_table, _write_buffer
 170 .extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
 171 .extern _fx_u, _fx_v, _fx_z, _fx_l
 172 .extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
 173 .extern _Transparency_on
 174
     // File-local scratch variables (declarations left disabled):
 175 //.local  dudx, dvdx, dzdx, dldx, l
 176 //.local  ubyz4, vbyz4, uvzero
 177 //.local  lastquartet, lastpixel, ctlwd
 178 //.local  flt64
 179
 180 .data
 181
 182 .p2align 2
 183
 184 dudx:           .int    0               // 4 * fx_du_dx, held as a float
 185 dvdx:           .int    0               // 4 * fx_dv_dx, held as a float
 186 dzdx:           .int    0               // 4 * fx_dz_dx, held as a float
 187 dldx:           .int    0               // 4 * fx_dl_dx, held as an integer
 188 l:              .int    0               // running l value for the scanline
 189 ubyz4:          .int    0               // (64*u)/z from the latest DoFPCalcs
 190 vbyz4:          .int    0               // (64*v)/z from the latest DoFPCalcs
 191 uvzero:         .int    0               // packed u/z and v/z for pixel 0
 192 lastquartet:    .int    0               // end pointer of the 4-pixel loop
 193 lastpixel:      .int    0               // end pointer of the whole scanline
 194 flt64:          .float  64.0            // 0x42800000, the dividend in 64/z
 195 ctlwd:          .int    0               // FPU control word saved on entry
 196
 197
 198 .text
 199
 200 .balign 4
 201
 202 //
 203 // void asm_ppro_tmap_scanline_per(void)
     //
     // Draws one perspective-corrected, lit, texture-mapped scanline into
     // write_buffer, covering pixels fx_xleft..fx_xright of row fx_y.
     //
     // In (globals):  pixptr, write_buffer, bytes_per_row, gr_fade_table,
     //                fx_xleft, fx_xright, fx_y, fx_u, fx_v, fx_z, fx_l,
     //                fx_du_dx, fx_dv_dx, fx_dz_dx, fx_dl_dx,
     //                Transparency_on
     // Out:           pixels stored into write_buffer; no return value
     // Preserves:     %ebx, %esi, %edi, %ebp (saved and restored here)
     // Clobbers:      %eax, %ecx, %edx, flags, and the scratch variables in
     //                this file's .data section
     // FPU:           four values are pushed and popped again before
     //                returning; DoFPCalcs needs two further stack slots
     //                while it runs. The control word's precision field is
     //                lowered for the duration and restored on exit.
 204 //
 205
 206 #ifdef __linux__
 207 asm_ppro_tmap_scanline_per:
 208 #else
 209 _asm_ppro_tmap_scanline_per:
 210 #endif
 211
 212 // Save the callee-saved registers that this routine clobbers. %ebx is
     // used as scratch throughout (starting with the control-word tweak
     // just below), so it has to be preserved along with the others.
 213
 214                 pushl   %ebp
 215                 pushl   %edi
 216                 pushl   %esi
                     pushl   %ebx
 217
 218 // Kick the FPU into the lowest precision (still enough for our needs)
 219 // so as to speed up fdiv.
 220
 221                 fnstcw  (ctlwd)
 222                 movw    (ctlwd), %ax
 223                 movl    %eax, %ebx
 224                 andb    $0xFC, %bh                      // clear control-word bits 8-9
 225                 movw    %bx, (ctlwd)
 226                 fldcw   (ctlwd)
 227                 movw    %ax, (ctlwd)                    // keep the original for LeaveNow
 228
 229 // Multiply dudx, dvdx, and dzdx by four, and store locally, converted
 230 // into floating point.
 231
 232                 movl    (_fx_du_dx), %eax
 233                 sall    $2, %eax
 234                 movl    %eax, (dudx)
 235                 movl    (_fx_dv_dx), %eax
 236                 sall    $2, %eax
 237                 movl    %eax, (dvdx)
 238                 movl    (_fx_dz_dx), %eax
 239                 sall    $2, %eax
 240                 movl    %eax, (dzdx)
 241                 fildl   (dudx)
 242                 fildl   (dvdx)
 243                 fildl   (dzdx)
 244                 fxch    %st(2)                          // bring dudx back to the top
 245                 fstps   (dudx)
 246                 fstps   (dvdx)
 247                 fstps   (dzdx)
 248
 249 // bytes_per_row * fx_y is the offset for the current scanline. (We do
 250 // this now before we start the first FP division.)
 251
 252                 movl    (_bytes_per_row), %eax
 253                 xorl    %edx, %edx
 254                 mull    (_fx_y)
 255
 256 // Push v, u, z, and 64.0 onto the FPU stack, and then start
 257 // calculating the first 64 / z.
 258
 259                 fildl   (_fx_v)
 260                 fildl   (_fx_u)
 261                 fildl   (_fx_z)
 262                 flds    (flt64)
 263                 fdiv    %st(1)
 264
 265 // Meanwhile, get l and dldx (again, the latter multiplied by four).
 266 // Both are kept in memory at full precision. (An earlier version
 267 // pre-divided them by 256 so that the byte needed for the fade table
 268 // offset would be aligned.)
 269
 270 //Dividing by 256 is bad.. rounding errors and crap.  We'll now do that
 271 //right before we need to access the table instead.  -MM
 272
 273                 movl    (_fx_l), %edx
 274 //              sarl    $8, %edx
 275                 movl    %edx, (l)
 276                 movl    (_fx_dl_dx), %edx
 277 //              sarl    $6, %edx
 278                 sall    $2, %edx
 279                 movl    %edx, (dldx)
 280
 281 // Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
 282 // write_buffer, the pointer to our frame buffer, in %edi. Then offset
 283 // %edi so that it points to pixel [fx_y][fx_xleft]. Calculate a
 284 // pointer to [fx_y][fx_xright + 1] so we know when to stop drawing.
 285 // Also calculate the pointer at which to stop drawing four pixels at
 286 // a time: lastpixel minus the (pixel count & 3) leftover pixels.
     // If there are no pixels to draw at all, leave straight away.
 287
 288                 movl    (_pixptr), %esi
 289                 movl    (_write_buffer), %edi
 290                 movl    (_fx_xright), %ecx
 291                 addl    %eax, %edi
 292                 incl    %ecx
 293                 addl    %edi, %ecx
 294                 movl    %ecx, (lastpixel)
 295                 addl    (_fx_xleft), %edi
 296                 movl    %ecx, %eax
 297                 subl    %edi, %eax                      // %eax = number of pixels
 298                 jle     LeaveNow
 299                 andl    $3, %eax
 300                 subl    %eax, %ecx
 301                 movl    %ecx, (lastquartet)
 302
 303 // Calculate round(64 * u / z) and round(64 * v / z), store, and
 304 // increment u, v, and z. Then start calculating the second 64 / z.
 305
 306                 DoFPCalcs
 307                 fdiv    %st(1)
 308
 309 // Get our u/z and v/z values, lop off the bits we don't care
 310 // about, pack, and store in uvzero.
 311
 312                 movl    (ubyz4), %eax
 313                 incl    %eax
 314                 andl    $0x3FF0, %eax
 315                 shrl    $4, %eax
 316                 movl    (vbyz4), %ebx
 317                 incl    %ebx
 318                 andl    $0x3FF0, %ebx
 319                 shll    $18, %ebx
 320                 orl     %eax, %ebx
 321                 movl    %ebx, (uvzero)
 322
 323 // Are there at least four pixels to draw? If not, skip to the epilog
 324 // code.
 325
 326                 cmpl    %ecx, %edi
 327                 je      LastBits
 328
 329 // Do we need to test for transparencies?
 330
 331                 testl   $(~0), (_Transparency_on)
 332                 jnz     LoopTransOn
 333
 334 // If not, then use the simpler loop here.
 335
 336 .balign 4
 337
 338 LoopTransOff:
 339
 340 // While the FPU is busy dividing, the latest u/z and v/z values are
 341 // retrieved, packed, and stored in uvzero (to be used again in the
 342 // next iteration). The old uvzero value, which contains the uv values
 343 // for pixel 0, gets subtracted from the new uvzero value to
 344 // determine the total change in u/z and v/z across the four pixels,
 345 // and this is divided by 4 to get the average. This average is then
 346 // used to estimate the values for pixels 1, 2, and 3. The old uvzero
 347 // value is used immediately to calculate pixel 0, while %eax, %ebx, and
 348 // %ecx are entrusted with the uv values for pixels 1, 2, and 3
 349 // respectively, while %edx is our "cleansed" register for using byte
 350 // values as memory pointer offsets. %ebp is loaded with the high byte
 351 // of l, forming half of the offset for the fade table lookup. (The
 352 // pixel from the texture-map bitmap supplies the other half.) Each
 353 // value is used to set its pixel as follows (assuming %eax holds our
 354 // packed uv value):
 355 //
 356 //      a:      bswapl  %eax                            / move u and v to the
 357 //      b:      roll    $6, %eax                        /   far right
 358 //      c:      andl    $0x0FFF, %eax                   / mask off extra bits
 359 //      d:      movb    (%esi,%eax), %dl                / get texture-map pixel
 360 //      e:      movb    fadetbl(%edx,%ebp), %dl         / correct for lighting
 361 //      f:      movb    %dl, (%edi)                     / write to frame buffer
 362 //
 363 // The above is done four times, once for each pixel. Some of the
 364 // calculations may appear to be interleaved haphazardly, but the PPro
 365 // seems to like it this way.
     //
     // The "orl $0x1000" sets a bit in the unused gap above the u field of
     // the new packed value before the old one is subtracted, so that a
     // borrow out of the u field is absorbed there instead of reaching the
     // v field.
 366
 367                 DoFPCalcs
 368                 fdiv    %st(1)
 369
 370                 xorl    %edx, %edx
 371                 movl    (uvzero), %eax                  // %eax = uv for pixel 0
 372                 bswapl  %eax                            // 0 a
 373                 roll    $6, %eax                        // 0 b
 374                 andl    $0x0FFF, %eax                   // 0 c
 375                 movb    (%esi,%eax), %dl                // 0 d
 376                 movl    (l), %ebp
 377                 movl    (dldx), %ecx
 378                 addl    %ebp, %ecx
 379                 movl    %ecx, (l)                       // l += 4 * fx_dl_dx
 380                 sarl    $8, %ebp
 381                 andl    $0x7F00, %ebp                   // %ebp = fade table row offset
 382                 movb    fadetbl(%edx,%ebp), %dl         // 0 e
 383
 384                 movl    (vbyz4), %ebx
 385                 incl    %ebx
 386                 andl    $0x3FF0, %ebx
 387                 movl    (ubyz4), %ecx
 388                 shll    $18, %ebx
 389                 incl    %ecx
 390                 andl    $0x3FF0, %ecx
 391                 shrl    $4, %ecx
 392                 movl    (uvzero), %eax
 393                 orl     %ebx, %ecx
 394                 movl    %ecx, (uvzero)
 395                 orl     $0x1000, %ecx
 396                 subl    %eax, %ecx
 397                 shrl    $2, %ecx                        // %ecx = per-pixel uv step
 398
 399                 movb    %dl, (%edi)                     // 0 f
 400                 lea     (%eax,%ecx,2), %ebx             // %ebx = uv for pixel 2
 401                 addl    %ecx, %eax                      // %eax = uv for pixel 1
 402                 bswapl  %eax                            // 1 a
 403                 roll    $6, %eax                        // 1 b
 404                 addl    %ebx, %ecx                      // %ecx = uv for pixel 3
 405                 bswapl  %ebx                            // 2 a
 406                 roll    $6, %ebx                        // 2 b
 407                 bswapl  %ecx                            // 3 a
 408                 andl    $0x0FFF, %eax                   // 1 c
 409                 andl    $0x0FFF, %ebx                   // 2 c
 410                 roll    $6, %ecx                        // 3 b
 411
 412                 movb    (%esi,%eax), %dl                // 1 d
 413                 movb    fadetbl(%edx,%ebp), %al         // 1 e
 414                 movb    (%esi,%ebx), %dl                // 2 d
 415                 movb    fadetbl(%edx,%ebp), %bl         // 2 e
 416                 movb    %al, 1(%edi)                    // 1 f
 417                 andl    $0x0FFF, %ecx                   // 3 c
 418                 movb    %bl, 2(%edi)                    // 2 f
 419                 movb    (%esi,%ecx), %dl                // 3 d
 420                 movb    fadetbl(%edx,%ebp), %cl         // 3 e
 421                 movb    %cl, 3(%edi)                    // 3 f
 422
 423                 addl    $4, %edi
 424                 cmpl    (lastquartet), %edi
 425                 jb      LoopTransOff                    // pointer compare: unsigned
 426
 427 // Are there any pixels left at all?
 428
 429                 cmpl    (lastpixel), %edi
 430                 jne     LastBits
 431                 jmp     LeaveNow
 432
 433
 434 .balign 4
 435
 436 LoopTransOn:
 437
 438 // This is similar to the LoopTransOff loop, the big change being that
 439 // each value retrieved from the texture map is tested against 255,
 440 // the transparent "color". A value of 255 in the texture map means to
 441 // let the existing value for that pixel in write_buffer go by
 442 // unchanged. Thus the code for each pixel looks something like this
 443 // instead:
 444 //
 445 //      a:      bswapl  %eax                            / move u and v to the
 446 //      b:      roll    $6, %eax                        /   far right
 447 //      c:      andl    $0x0FFF, %eax                   / mask off extra bits
 448 //      d:      movb    (%esi,%eax), %dl                / get texture-map pixel
 449 //      e:      cmpb    $255, %dl                       / is pixel transparent?
 450 //      f:      sbbb    %ah, %ah                        / yes:%ah=00, no:%ah=FF
 451 //      g:      movb    fadetbl(%edx,%ebp), %dl         / correct for lighting
 452 //      h:      movb    (%edi), %al                     / get current pixel
 453 //      i:      xorb    %al, %dl                        / combine the two
 454 //      j:      andb    %dl, %ah                        / use %ah as a mask to
 455 //      k:      xorb    %ah, %al                        /   select which pixel
 456 //      l:      movb    %al, (%edi)                     / write to frame buffer
 457 //
 458 // When the texture-map value is 255, the code simply writes the
 459 // original frame-buffer value back out again; otherwise the new pixel
 460 // is written instead. The ands and xors used to accomplish this bulk
 461 // up the code, but on the whole it is better than having four
 462 // unpredictable jumps in the loop.
 463
 464                 DoFPCalcs
 465                 fdiv    %st(1)
 466
 467                 movl    (uvzero), %eax                  // %eax = uv for pixel 0
 468                 bswapl  %eax                            // 0 a
 469                 movl    (dldx), %ecx
 470                 movl    (l), %ebp
 471                 addl    %ebp, %ecx
 472                 roll    $6, %eax                        // 0 b
 473                 andl    $0x0FFF, %eax                   // 0 c
 474                 xorl    %edx, %edx
 475                 movb    (%esi,%eax), %dl                // 0 d
 476                 cmpb    $255, %dl                       // 0 e
 477                 sbbb    %ah, %ah                        // 0 f
 478                 movl    %ecx, (l)                       // l += 4 * fx_dl_dx
 479                 sarl    $8, %ebp
 480                 andl    $0x7F00, %ebp                   // %ebp = fade table row offset
 481
 482                 movb    fadetbl(%edx,%ebp), %dl         // 0 g
 483                 movb    (%edi), %al                     // 0 h
 484                 xorb    %al, %dl                        // 0 i
 485                 andb    %dl, %ah                        // 0 j
 486                 xorb    %ah, %al                        // 0 k
 487                 movb    %al, (%edi)                     // 0 l
 488
 489                 movl    (vbyz4), %ebx
 490                 movl    (ubyz4), %ecx
 491                 incl    %ebx
 492                 andl    $0x3FF0, %ebx
 493                 incl    %ecx
 494                 andl    $0x3FF0, %ecx
 495                 shll    $18, %ebx
 496                 shrl    $4, %ecx
 497                 orl     %ebx, %ecx
 498                 movl    (uvzero), %eax
 499                 movl    %ecx, (uvzero)
 500                 orl     $0x1000, %ecx
 501                 subl    %eax, %ecx
 502                 shrl    $2, %ecx                        // %ecx = per-pixel uv step
 503
 504                 lea     (%eax,%ecx,2), %ebx             // %ebx = uv for pixel 2
 505                 addl    %ecx, %eax                      // %eax = uv for pixel 1
 506                 bswapl  %eax                            // 1 a
 507                 roll    $6, %eax                        // 1 b
 508                 addl    %ebx, %ecx                      // %ecx = uv for pixel 3
 509                 bswapl  %ebx                            // 2 a
 510                 roll    $6, %ebx                        // 2 b
 511                 andl    $0x0FFF, %eax                   // 1 c
 512                 movb    (%esi,%eax), %dl                // 1 d
 513                 cmpb    $255, %dl                       // 1 e
 514                 sbbb    %ah, %ah                        // 1 f
 515                 bswapl  %ecx                            // 3 a
 516                 movb    1(%edi), %al                    // 1 h
 517                 movb    fadetbl(%edx,%ebp), %dl         // 1 g
 518
 519                 roll    $6, %ecx                        // 3 b
 520                 andl    $0x0FFF, %ebx                   // 2 c
 521                 xorb    %al, %dl                        // 1 i
 522                 andb    %dl, %ah                        // 1 j
 523                 movb    (%esi,%ebx), %dl                // 2 d
 524                 cmpb    $255, %dl                       // 2 e
 525                 sbbb    %bh, %bh                        // 2 f
 526                 movb    fadetbl(%edx,%ebp), %dl         // 2 g
 527                 andl    $0x0FFF, %ecx                   // 3 c
 528                 movb    2(%edi), %bl                    // 2 h
 529                 xorb    %bl, %dl                        // 2 i
 530                 andb    %dl, %bh                        // 2 j
 531
 532                 movb    (%esi,%ecx), %dl                // 3 d
 533                 cmpb    $255, %dl                       // 3 e
 534                 sbbb    %ch, %ch                        // 3 f
 535                 movb    3(%edi), %cl                    // 3 h
 536                 movb    fadetbl(%edx,%ebp), %dl         // 3 g
 537                 xorb    %cl, %dl                        // 3 i
 538                 andb    %dl, %ch                        // 3 j
 539
 540                 xorb    %ah, %al                        // 1 k
 541                 movb    %al, 1(%edi)                    // 1 l
 542                 xorb    %bh, %bl                        // 2 k
 543                 movb    %bl, 2(%edi)                    // 2 l
 544                 xorb    %ch, %cl                        // 3 k
 545                 movb    %cl, 3(%edi)                    // 3 l
 546
 547                 addl    $4, %edi
 548                 cmpl    (lastquartet), %edi
 549                 jb      LoopTransOn                     // pointer compare: unsigned
 550
 551 // Quit if there are none at all left.
 552
 553                 cmpl    (lastpixel), %edi
 554                 je      LeaveNow
 555
 556
 557 LastBits:
 558
 559 // Here we finish off the last one-to-three pixels assigned to us.
 560 // Rather than calculating values for all four pixels, we just divide
 561 // the difference by four and keep adding this average into the value
 562 // as needed. (This code is not particularly optimized, by the way,
 563 // since it represents such a minuscule amount of the running time.)
     //
     // No further division is started after this DoFPCalcs, and the same
     // fade table row is used for all of the remaining pixels. This loop
     // always tests for the transparent color (255), whether or not
     // Transparency_on is set.
 564
 565                 DoFPCalcs
 566                 movl    (l), %ebp
 567                 sarl    $8, %ebp
 568                 andl    $0x7F00, %ebp                   // %ebp = fade table row offset
 569                 movl    (ubyz4), %eax
 570                 incl    %eax
 571                 andl    $0x3FF0, %eax
 572                 shrl    $4, %eax
 573                 movl    (vbyz4), %ecx
 574                 incl    %ecx
 575                 andl    $0x3FF0, %ecx
 576                 shll    $18, %ecx
 577                 orl     %eax, %ecx
 578                 movl    (uvzero), %ebx                  // %ebx = uv for the next pixel
 579                 orl     $0x1000, %ecx
 580                 subl    %ebx, %ecx
 581                 shrl    $2, %ecx                        // %ecx = per-pixel uv step
 582                 xorl    %edx, %edx
 583
 584 LoopLastBits:   movl    %ebx, %eax
 585                 bswapl  %eax
 586                 roll    $6, %eax
 587                 andl    $0x0FFF, %eax
 588                 movb    (%esi,%eax), %dl
 589                 cmpb    $255, %dl
 590                 je      LetPixelBy
 591                 movb    fadetbl(%edx,%ebp), %dl
 592                 movb    %dl, (%edi)
 593 LetPixelBy:     incl    %edi
 594                 addl    %ecx, %ebx
 595                 cmpl    (lastpixel), %edi
 596                 jb      LoopLastBits                    // pointer compare: unsigned
 597
 598
 599 LeaveNow:
 600
 601 // We're done! Restore the saved registers, pop the four values left
 602 // on the FPU stack, reset the FPU control word, and return.
 603
                     popl    %ebx
 604                 popl    %esi
 605                 popl    %edi
 606                 popl    %ebp
 607                 fcompp
 608                 fcompp
 609                 fldcw   (ctlwd)
 610                 ret