/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_3DNow.h"

//===============================================================
//
//	3DNow! implementation of idSIMDProcessor
//
//===============================================================

/*
============
idSIMD_3DNow::GetName
============
*/
const char * idSIMD_3DNow::GetName( void ) const {
	return "MMX & 3DNow!";
}

// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY		64			// upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY		64 * 1024	// upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY		197 * 1024	// upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY	infinity	// no limit for movq/movntq w/block prefetch
#define CACHEBLOCK			80h			// number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
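
// The size-based strategy selection described above can be paraphrased in C.
// The sketch below is purely illustrative and deliberately compiled out: the
// function name is hypothetical, it is never called by the engine, and plain
// memcpy() stands in for each of the labeled assembly sections of the routine below.
#if 0
static void Memcpy_StrategySketch( void *dest, const void *src, const int n ) {
	if ( n < TINY_BLOCK_COPY ) {
		memcpy( dest, src, n );		// movsd/movsb tail copy ($memcpy_ic_3 / $memcpy_last_few)
	} else if ( ( n >> 6 ) < IN_CACHE_COPY / 64 ) {
		memcpy( dest, src, n );		// MMX movq/movq copy with prefetchnta ($memcpy_ic_1)
	} else if ( ( n >> 6 ) < UNCACHED_COPY / 64 ) {
		memcpy( dest, src, n );		// movq/movntq streaming-store copy ($memcpy_uc_1)
	} else {
		memcpy( dest, src, n );		// block prefetch copy ($memcpy_bp_1), tail via $memcpy_uc_1
	}
}
#endif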

/*
============
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
============
*/
void VPCALL idSIMD_3DNow::Memcpy( void *dest, const void *src, const int n ) {
	__asm {
		mov		ecx, [n]		// number of bytes to copy
		mov		edi, [dest]		// destination
		mov		esi, [src]		// source
		mov		ebx, ecx		// keep a copy of count

		cmp		ecx, TINY_BLOCK_COPY
		jb		$memcpy_ic_3	// tiny? skip mmx copy

		cmp		ecx, 32*1024		// don't align between 32k-64k because
		jbe		$memcpy_do_align	// it appears to be slower
		cmp		ecx, 64*1024
		jbe		$memcpy_align_done
$memcpy_do_align:
		mov		ecx, 8			// a trick that's faster than rep movsb...
		sub		ecx, edi		// align destination to qword
		and		ecx, 111b		// get the low bits
		sub		ebx, ecx		// update copy count
		neg		ecx				// set up to jump into the array
		add		ecx, offset $memcpy_align_done
		jmp		ecx				// jump to array of movsb's
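
		// Worked example (illustrative): if edi ends in binary 101, the destination is
		// 3 bytes short of the next qword boundary, so ecx = (8 - edi) & 7 = 3; the jmp
		// then lands 3 bytes before $memcpy_align_done and falls through exactly three
		// of the one-byte movsb instructions below.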
		align	4
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb

$memcpy_align_done:				// destination is dword aligned
		mov		ecx, ebx		// number of bytes left to copy
		shr		ecx, 6			// get 64-byte block count
		jz		$memcpy_ic_2	// finish the last few bytes

		cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
		jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
		align	16
$memcpy_ic_1:					// 64-byte block copies, in-cache copy

		prefetchnta [esi + (200*64/34+192)]		// start reading ahead
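		// note (illustrative): the constant (200*64/34+192) evaluates to about 568 bytes,
		// i.e. the prefetch runs roughly nine 64-byte cache lines ahead of the reads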

		movq	mm0, [esi+0]	// read 64 bits
		movq	mm1, [esi+8]
		movq	[edi+0], mm0	// write 64 bits
		movq	[edi+8], mm1	// note: the normal movq writes the
		movq	mm2, [esi+16]	// data to cache; a cache line will be
		movq	mm3, [esi+24]	// allocated as needed, to store the data
		movq	[edi+16], mm2
		movq	[edi+24], mm3
		movq	mm0, [esi+32]
		movq	mm1, [esi+40]
		movq	[edi+32], mm0
		movq	[edi+40], mm1
		movq	mm2, [esi+48]
		movq	mm3, [esi+56]
		movq	[edi+48], mm2
		movq	[edi+56], mm3

		add		esi, 64			// update source pointer
		add		edi, 64			// update destination pointer
		dec		ecx				// count down
		jnz		$memcpy_ic_1	// last 64-byte block?

$memcpy_ic_2:
		mov		ecx, ebx		// has valid low 6 bits of the byte count
$memcpy_ic_3:
		shr		ecx, 2			// dword count
		and		ecx, 1111b		// only look at the "remainder" bits
		neg		ecx				// set up to jump into the array
		add		ecx, offset $memcpy_last_few
		jmp		ecx				// jump to array of movsd's
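
		// Worked example (illustrative): with 63 bytes left over, ecx = (63 >> 2) & 15 = 15,
		// so the jmp lands 15 bytes before $memcpy_last_few and falls through 15 one-byte
		// movsd instructions (copying 60 bytes); the final 3 bytes are handled by the
		// rep movsb at $memcpy_last_few.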

$memcpy_uc_test:
		cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
		jae		$memcpy_bp_1
$memcpy_64_test:
		or		ecx, ecx		// tail end of block prefetch will jump here
		jz		$memcpy_ic_2	// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
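// note: movntq stores are weakly ordered and go through write-combining buffers,
// which is why the routine ends with an sfence (see $memcpy_final) to flush them
// before returning.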
		align	16
$memcpy_uc_1:					// 64-byte blocks, uncached copy

		prefetchnta [esi + (200*64/34+192)]		// start reading ahead

		movq	mm0, [esi+0]	// read 64 bits
		add		edi, 64			// update destination pointer
		movq	mm1, [esi+8]
		add		esi, 64			// update source pointer
		movq	mm2, [esi-48]
		movntq	[edi-64], mm0	// write 64 bits, bypassing the cache
		movq	mm0, [esi-40]	// note: movntq also prevents the CPU
		movntq	[edi-56], mm1	// from READING the destination address
		movq	mm1, [esi-32]	// into the cache, only to be over-written
		movntq	[edi-48], mm2	// so that also helps performance
		movq	mm2, [esi-24]
		movntq	[edi-40], mm0
		movq	mm0, [esi-16]
		movntq	[edi-32], mm1
		movq	mm1, [esi-8]
		movntq	[edi-24], mm2
		movntq	[edi-16], mm0
		dec		ecx
		movntq	[edi-8], mm1
		jnz		$memcpy_uc_1	// last 64-byte block?

		jmp		$memcpy_ic_2	// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
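// (For scale: CACHEBLOCK is 80h = 128 cache lines, so each prefetch pass below pulls
// 128 * 64 = 8 KB of source data into the cache before it is copied out with movntq.)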
$memcpy_bp_1:					// large blocks, block prefetch copy

		cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
		jl		$memcpy_64_test			// no, back to regular uncached copy

		mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
		add		esi, CACHEBLOCK * 64	// move to the top of the block
		align	16
$memcpy_bp_2:
		mov		edx, [esi-64]		// grab one address per cache line
		mov		edx, [esi-128]		// grab one address per cache line
		sub		esi, 128			// go reverse order
		dec		eax					// count down the cache lines
		jnz		$memcpy_bp_2		// keep grabbing more lines into cache

		mov		eax, CACHEBLOCK		// now that it's in cache, do the copy
		align	16
$memcpy_bp_3:
		movq	mm0, [esi   ]		// read 64 bits
		movq	mm1, [esi+ 8]
		movq	mm2, [esi+16]
		movq	mm3, [esi+24]
		movq	mm4, [esi+32]
		movq	mm5, [esi+40]
		movq	mm6, [esi+48]
		movq	mm7, [esi+56]
		add		esi, 64				// update source pointer
		movntq	[edi   ], mm0		// write 64 bits, bypassing cache
		movntq	[edi+ 8], mm1		// note: movntq also prevents the CPU
		movntq	[edi+16], mm2		// from READING the destination address
		movntq	[edi+24], mm3		// into the cache, only to be over-written,
		movntq	[edi+32], mm4		// so that also helps performance
		movntq	[edi+40], mm5
		movntq	[edi+48], mm6
		movntq	[edi+56], mm7
		add		edi, 64				// update dest pointer

		dec		eax					// count down

		jnz		$memcpy_bp_3		// keep copying
		sub		ecx, CACHEBLOCK		// update the 64-byte block count
		jmp		$memcpy_bp_1		// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
		align	4
		movsd
		movsd			// perform last 1-15 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd			// perform last 1-7 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd

$memcpy_last_few:				// dword aligned from before movsd's
		mov		ecx, ebx		// has valid low 2 bits of the byte count
		and		ecx, 11b		// the last few cows must come home
		jz		$memcpy_final	// no more, let's leave
		rep		movsb			// the last 1, 2, or 3 bytes

$memcpy_final:
		emms				// clean up the MMX state
		sfence				// flush the write buffer
		mov		eax, [dest]	// ret value = destination pointer
	}
}