/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_3DNow.h"

//===============================================================
//
//	3DNow! implementation of idSIMDProcessor
//
//===============================================================

/*
============
idSIMD_3DNow::GetName
============
*/
const char * idSIMD_3DNow::GetName( void ) const {
	return "MMX & 3DNow!";
}

// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY		64			// upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY		64 * 1024	// upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY		197 * 1024	// upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY	infinity	// no limit for movq/movntq w/block prefetch
#define CACHEBLOCK			80h			// number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
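
// The size-based strategy selection described above can be paraphrased in C.
// The sketch below is purely illustrative and deliberately compiled out: the
// function name is hypothetical, it is never called by the engine, and plain
// memcpy() stands in for each of the labeled assembly sections of the routine below.
#if 0
static void Memcpy_StrategySketch( void *dest, const void *src, const int n ) {
	if ( n < TINY_BLOCK_COPY ) {
		memcpy( dest, src, n );		// movsd/movsb tail copy ($memcpy_ic_3 / $memcpy_last_few)
	} else if ( ( n >> 6 ) < IN_CACHE_COPY / 64 ) {
		memcpy( dest, src, n );		// MMX movq/movq copy with prefetchnta ($memcpy_ic_1)
	} else if ( ( n >> 6 ) < UNCACHED_COPY / 64 ) {
		memcpy( dest, src, n );		// movq/movntq streaming-store copy ($memcpy_uc_1)
	} else {
		memcpy( dest, src, n );		// block prefetch copy ($memcpy_bp_1), tail via $memcpy_uc_1
	}
}
#endif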

/*
============
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
============
*/
void VPCALL idSIMD_3DNow::Memcpy( void *dest, const void *src, const int n ) {
	__asm {
		mov		ecx, [n]		// number of bytes to copy
		mov		edi, [dest]		// destination
		mov		esi, [src]		// source
		mov		ebx, ecx		// keep a copy of count

		cmp		ecx, TINY_BLOCK_COPY
		jb		$memcpy_ic_3	// tiny? skip mmx copy

		cmp		ecx, 32*1024		// don't align between 32k-64k because
		jbe		$memcpy_do_align	// it appears to be slower
		cmp		ecx, 64*1024
		jbe		$memcpy_align_done
$memcpy_do_align:
		mov		ecx, 8			// a trick that's faster than rep movsb...
		sub		ecx, edi		// align destination to qword
		and		ecx, 111b		// get the low bits
		sub		ebx, ecx		// update copy count
		neg		ecx				// set up to jump into the array
		add		ecx, offset $memcpy_align_done
		jmp		ecx				// jump to array of movsb's
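
		// Worked example (illustrative): if edi ends in binary 101, the destination is
		// 3 bytes short of the next qword boundary, so ecx = (8 - edi) & 7 = 3; the jmp
		// then lands 3 bytes before $memcpy_align_done and falls through exactly three
		// of the one-byte movsb instructions below.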
		align	4
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb

$memcpy_align_done:				// destination is dword aligned
		mov		ecx, ebx		// number of bytes left to copy
		shr		ecx, 6			// get 64-byte block count
		jz		$memcpy_ic_2	// finish the last few bytes

		cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
		jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
		align	16
$memcpy_ic_1:					// 64-byte block copies, in-cache copy

		prefetchnta [esi + (200*64/34+192)]		// start reading ahead
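		// note (illustrative): the constant (200*64/34+192) evaluates to about 568 bytes,
		// i.e. the prefetch runs roughly nine 64-byte cache lines ahead of the reads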

		movq	mm0, [esi+0]	// read 64 bits
		movq	mm1, [esi+8]
		movq	[edi+0], mm0	// write 64 bits
		movq	[edi+8], mm1	// note: the normal movq writes the
		movq	mm2, [esi+16]	// data to cache; a cache line will be
		movq	mm3, [esi+24]	// allocated as needed, to store the data
		movq	[edi+16], mm2
		movq	[edi+24], mm3
		movq	mm0, [esi+32]
		movq	mm1, [esi+40]
		movq	[edi+32], mm0
		movq	[edi+40], mm1
		movq	mm2, [esi+48]
		movq	mm3, [esi+56]
		movq	[edi+48], mm2
		movq	[edi+56], mm3

		add		esi, 64			// update source pointer
		add		edi, 64			// update destination pointer
		dec		ecx				// count down
		jnz		$memcpy_ic_1	// last 64-byte block?

$memcpy_ic_2:
		mov		ecx, ebx		// has valid low 6 bits of the byte count
$memcpy_ic_3:
		shr		ecx, 2			// dword count
		and		ecx, 1111b		// only look at the "remainder" bits
		neg		ecx				// set up to jump into the array
		add		ecx, offset $memcpy_last_few
		jmp		ecx				// jump to array of movsd's
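
		// Worked example (illustrative): with 63 bytes left over, ecx = (63 >> 2) & 15 = 15,
		// so the jmp lands 15 bytes before $memcpy_last_few and falls through 15 one-byte
		// movsd instructions (copying 60 bytes); the final 3 bytes are handled by the
		// rep movsb at $memcpy_last_few.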

$memcpy_uc_test:
		cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
		jae		$memcpy_bp_1
$memcpy_64_test:
		or		ecx, ecx		// tail end of block prefetch will jump here
		jz		$memcpy_ic_2	// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
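// note: movntq stores are weakly ordered and go through write-combining buffers,
// which is why the routine ends with an sfence (see $memcpy_final) to flush them
// before returning.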
		align	16
$memcpy_uc_1:					// 64-byte blocks, uncached copy

		prefetchnta [esi + (200*64/34+192)]		// start reading ahead

		movq	mm0, [esi+0]	// read 64 bits
		add		edi, 64			// update destination pointer
		movq	mm1, [esi+8]
		add		esi, 64			// update source pointer
		movq	mm2, [esi-48]
		movntq	[edi-64], mm0	// write 64 bits, bypassing the cache
		movq	mm0, [esi-40]	// note: movntq also prevents the CPU
		movntq	[edi-56], mm1	// from READING the destination address
		movq	mm1, [esi-32]	// into the cache, only to be over-written
		movntq	[edi-48], mm2	// so that also helps performance
		movq	mm2, [esi-24]
		movntq	[edi-40], mm0
		movq	mm0, [esi-16]
		movntq	[edi-32], mm1
		movq	mm1, [esi-8]
		movntq	[edi-24], mm2
		movntq	[edi-16], mm0
		dec		ecx
		movntq	[edi-8], mm1
		jnz		$memcpy_uc_1	// last 64-byte block?

		jmp		$memcpy_ic_2	// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
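// (For scale: CACHEBLOCK is 80h = 128 cache lines, so each prefetch pass below pulls
// 128 * 64 = 8 KB of source data into the cache before it is copied out with movntq.)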
$memcpy_bp_1:					// large blocks, block prefetch copy

		cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
		jl		$memcpy_64_test			// no, back to regular uncached copy

		mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
		add		esi, CACHEBLOCK * 64	// move to the top of the block
		align	16
$memcpy_bp_2:
		mov		edx, [esi-64]		// grab one address per cache line
		mov		edx, [esi-128]		// grab one address per cache line
		sub		esi, 128			// go reverse order
		dec		eax					// count down the cache lines
		jnz		$memcpy_bp_2		// keep grabbing more lines into cache

		mov		eax, CACHEBLOCK		// now that it's in cache, do the copy
		align	16
$memcpy_bp_3:
		movq	mm0, [esi   ]		// read 64 bits
		movq	mm1, [esi+ 8]
		movq	mm2, [esi+16]
		movq	mm3, [esi+24]
		movq	mm4, [esi+32]
		movq	mm5, [esi+40]
		movq	mm6, [esi+48]
		movq	mm7, [esi+56]
		add		esi, 64				// update source pointer
		movntq	[edi   ], mm0		// write 64 bits, bypassing cache
		movntq	[edi+ 8], mm1		// note: movntq also prevents the CPU
		movntq	[edi+16], mm2		// from READING the destination address
		movntq	[edi+24], mm3		// into the cache, only to be over-written,
		movntq	[edi+32], mm4		// so that also helps performance
		movntq	[edi+40], mm5
		movntq	[edi+48], mm6
		movntq	[edi+56], mm7
		add		edi, 64				// update dest pointer

		dec		eax					// count down

		jnz		$memcpy_bp_3		// keep copying
		sub		ecx, CACHEBLOCK		// update the 64-byte block count
		jmp		$memcpy_bp_1		// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
		align	4
		movsd
		movsd			// perform last 1-15 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd			// perform last 1-7 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd

$memcpy_last_few:				// dword aligned from before movsd's
		mov		ecx, ebx		// has valid low 2 bits of the byte count
		and		ecx, 11b		// the last few cows must come home
		jz		$memcpy_final	// no more, let's leave
		rep		movsb			// the last 1, 2, or 3 bytes

$memcpy_final:
		emms				// clean up the MMX state
		sfence				// flush the write buffer
		mov		eax, [dest]	// ret value = destination pointer
	}
}