00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
/*
 * Block length in bytes. The MMX path of fast_memcpy() copies 64-byte chunks
 * until the destination is BLOCK_SIZE-aligned, and the disabled (#if 0)
 * block copier works in units of this size.
 */
00030 #define BLOCK_SIZE 4096
/*
 * Repeat count for the extra read loop inside the disabled (#if 0) block
 * copier; that loop is only assembled when this value is greater than 0.
 */
00031 #define CONFUSION_FACTOR 0
00032
00033
00034
00035
/*
 * Unless HAVE_SSE2 is defined, HAVE_SSE is dropped here. Every
 * `#ifdef HAVE_SSE` section further down is then compiled out and the
 * MMX code path is used instead.
 * NOTE(review): the reason for tying the SSE path to SSE2 is not visible in
 * this listing -- confirm before changing the guard.
 */
00036 #ifndef HAVE_SSE2
00037
00038
00039
00040
00041
00042
00043
00044
00045 #undef HAVE_SSE
00046 #endif
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096 #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
00097
/*
 * HAVE_MMX1: set when MMX is the only extension available, i.e. HAVE_MMX is
 * defined and none of HAVE_MMX2, HAVE_3DNOW, HAVE_SSE are. It is reset first
 * so a stale definition cannot leak in. Below it selects the larger MIN_LEN
 * and removes the PREFETCH instructions.
 */
00098 #undef HAVE_MMX1
00099 #if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
00100
00101
00102
00103
00104
00105 #define HAVE_MMX1
00106 #endif
00107
00108
/*
 * HAVE_K6_2PLUS: set when 3DNow! is available but MMX2 is not. Below it
 * selects the "prefetch" / "femms" mnemonics instead of "prefetchnta" / "emms".
 */
00109 #undef HAVE_K6_2PLUS
00110 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
00111 #define HAVE_K6_2PLUS
00112 #endif
00113
00114
/*
 * small_memcpy(to, from, n): copy n bytes from `from` to `to` with a single
 * "rep; movsb".
 *
 * `to` and `from` are bound to the destination/source index registers
 * (constraints "D" and "S") both as inputs and as outputs, so they must be
 * modifiable lvalues and are overwritten with the register values left by the
 * instruction. fast_memcpy() relies on this to continue copying where the
 * macro stopped. `n` is loaded into the count register ("c"); its final value
 * lands in the scratch variable `dummy` and is discarded.
 *
 * The "memory" clobber tells the compiler that memory was modified.
 * NOTE(review): assumes the direction flag is clear (forward copy), as the
 * usual x86 ABIs guarantee -- confirm for the target platform.
 */
00115 #define small_memcpy(to,from,n)\
00116 {\
00117 register unsigned long int dummy;\
00118 __asm__ __volatile__(\
00119 "rep; movsb"\
00120 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
00121 \
00122 \
00123 \
00124 :"0" (to), "1" (from),"2" (n)\
00125 : "memory");\
00126 }
00127
/*
 * MMREG_SIZE: boundary (in bytes) to which fast_memcpy() advances the
 * destination before entering the block-copy loops; must be a power of two
 * because it is used as a mask (MMREG_SIZE-1).
 * NOTE(review): the non-SSE value (64) is larger than an MMX register;
 * presumably a cache-line alignment choice -- confirm.
 */
00128 #ifdef HAVE_SSE
00129 #define MMREG_SIZE 16
00130 #else
00131 #define MMREG_SIZE 64
00132 #endif
00133
00134
/*
 * PREFETCH / EMMS: instruction mnemonics pasted into the inline assembly.
 * HAVE_K6_2PLUS builds use "prefetch" and "femms"; all others use
 * "prefetchnta" and "emms".
 */
00135 #ifdef HAVE_K6_2PLUS
00136 #define PREFETCH "prefetch"
00137
00138 #define EMMS "femms"
00139 #else
00140 #define PREFETCH "prefetchnta"
00141 #define EMMS "emms"
00142 #endif
00143
/*
 * MOVNTQ: store mnemonic for the MMX loops -- the non-temporal "movntq" when
 * HAVE_MMX2 is defined, otherwise an ordinary "movq".
 */
00144 #ifdef HAVE_MMX2
00145 #define MOVNTQ "movntq"
00146 #else
00147 #define MOVNTQ "movq"
00148 #endif
00149
/*
 * MIN_LEN: smallest length (bytes) for which fast_memcpy() takes the SIMD
 * path; shorter copies go straight to small_memcpy(). Plain-MMX builds
 * (HAVE_MMX1) use a much higher threshold (0x800) than the others (0x40).
 */
00150 #ifdef HAVE_MMX1
00151 #define MIN_LEN 0x800
00152 #else
00153 #define MIN_LEN 0x40
00154 #endif
00155
00156 void * fast_memcpy(void * to, const void * from, size_t len)
00157 {
00158 void *retval;
00159 size_t i;
00160 retval = to;
00161 #ifdef STATISTICS
00162 {
00163 static int freq[33];
00164 static int t=0;
00165 int i;
00166 for(i=0; len>(1<<i); i++);
00167 freq[i]++;
00168 t++;
00169 if(1024*1024*1024 % t == 0)
00170 for(i=0; i<32; i++)
00171 printf("freq < %8d %4d\n", 1<<i, freq[i]);
00172 }
00173 #endif
00174 #ifndef HAVE_MMX1
00175
00176 __asm__ __volatile__ (
00177 PREFETCH" (%0)\n"
00178 PREFETCH" 64(%0)\n"
00179 PREFETCH" 128(%0)\n"
00180 PREFETCH" 192(%0)\n"
00181 PREFETCH" 256(%0)\n"
00182 : : "r" (from) );
00183 #endif
00184 if(len >= MIN_LEN)
00185 {
00186 register unsigned long int delta;
00187
00188 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
00189 if(delta)
00190 {
00191 delta=MMREG_SIZE-delta;
00192 len -= delta;
00193 small_memcpy(to, from, delta);
00194 }
00195 i = len >> 6;
00196 len&=63;
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206 #ifdef HAVE_SSE
00207 if(((unsigned long)from) & 15)
00208
00209 for(; i>0; i--)
00210 {
00211 __asm__ __volatile__ (
00212 PREFETCH" 320(%0)\n"
00213 "movups (%0), %%xmm0\n"
00214 "movups 16(%0), %%xmm1\n"
00215 "movups 32(%0), %%xmm2\n"
00216 "movups 48(%0), %%xmm3\n"
00217 "movntps %%xmm0, (%1)\n"
00218 "movntps %%xmm1, 16(%1)\n"
00219 "movntps %%xmm2, 32(%1)\n"
00220 "movntps %%xmm3, 48(%1)\n"
00221 :: "r" (from), "r" (to) : "memory");
00222 ((const unsigned char *)from)+=64;
00223 ((unsigned char *)to)+=64;
00224 }
00225 else
00226
00227
00228
00229
00230
00231 for(; i>0; i--)
00232 {
00233 __asm__ __volatile__ (
00234 PREFETCH" 320(%0)\n"
00235 "movaps (%0), %%xmm0\n"
00236 "movaps 16(%0), %%xmm1\n"
00237 "movaps 32(%0), %%xmm2\n"
00238 "movaps 48(%0), %%xmm3\n"
00239 "movntps %%xmm0, (%1)\n"
00240 "movntps %%xmm1, 16(%1)\n"
00241 "movntps %%xmm2, 32(%1)\n"
00242 "movntps %%xmm3, 48(%1)\n"
00243 :: "r" (from), "r" (to) : "memory");
00244 ((const unsigned char *)from)+=64;
00245 ((unsigned char *)to)+=64;
00246 }
00247 #else
00248
00249 for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--)
00250 {
00251 __asm__ __volatile__ (
00252 #ifndef HAVE_MMX1
00253 PREFETCH" 320(%0)\n"
00254 #endif
00255 "movq (%0), %%mm0\n"
00256 "movq 8(%0), %%mm1\n"
00257 "movq 16(%0), %%mm2\n"
00258 "movq 24(%0), %%mm3\n"
00259 "movq 32(%0), %%mm4\n"
00260 "movq 40(%0), %%mm5\n"
00261 "movq 48(%0), %%mm6\n"
00262 "movq 56(%0), %%mm7\n"
00263 MOVNTQ" %%mm0, (%1)\n"
00264 MOVNTQ" %%mm1, 8(%1)\n"
00265 MOVNTQ" %%mm2, 16(%1)\n"
00266 MOVNTQ" %%mm3, 24(%1)\n"
00267 MOVNTQ" %%mm4, 32(%1)\n"
00268 MOVNTQ" %%mm5, 40(%1)\n"
00269 MOVNTQ" %%mm6, 48(%1)\n"
00270 MOVNTQ" %%mm7, 56(%1)\n"
00271 :: "r" (from), "r" (to) : "memory");
00272 from = (const void *) (((const unsigned char *)from)+64);
00273 to = (void *) (((unsigned char *)to)+64);
00274 }
00275
00276
00277
00278 # if 0
00279 if(i>=BLOCK_SIZE/64)
00280 asm volatile(
00281 "xorl %%eax, %%eax \n\t"
00282 ".balign 16 \n\t"
00283 "1: \n\t"
00284 "movl (%0, %%eax), %%ebx \n\t"
00285 "movl 32(%0, %%eax), %%ebx \n\t"
00286 "movl 64(%0, %%eax), %%ebx \n\t"
00287 "movl 96(%0, %%eax), %%ebx \n\t"
00288 "addl $128, %%eax \n\t"
00289 "cmpl %3, %%eax \n\t"
00290 " jb 1b \n\t"
00291
00292 "xorl %%eax, %%eax \n\t"
00293
00294 ".balign 16 \n\t"
00295 "2: \n\t"
00296 "movq (%0, %%eax), %%mm0\n"
00297 "movq 8(%0, %%eax), %%mm1\n"
00298 "movq 16(%0, %%eax), %%mm2\n"
00299 "movq 24(%0, %%eax), %%mm3\n"
00300 "movq 32(%0, %%eax), %%mm4\n"
00301 "movq 40(%0, %%eax), %%mm5\n"
00302 "movq 48(%0, %%eax), %%mm6\n"
00303 "movq 56(%0, %%eax), %%mm7\n"
00304 MOVNTQ" %%mm0, (%1, %%eax)\n"
00305 MOVNTQ" %%mm1, 8(%1, %%eax)\n"
00306 MOVNTQ" %%mm2, 16(%1, %%eax)\n"
00307 MOVNTQ" %%mm3, 24(%1, %%eax)\n"
00308 MOVNTQ" %%mm4, 32(%1, %%eax)\n"
00309 MOVNTQ" %%mm5, 40(%1, %%eax)\n"
00310 MOVNTQ" %%mm6, 48(%1, %%eax)\n"
00311 MOVNTQ" %%mm7, 56(%1, %%eax)\n"
00312 "addl $64, %%eax \n\t"
00313 "cmpl %3, %%eax \n\t"
00314 "jb 2b \n\t"
00315
00316 #if CONFUSION_FACTOR > 0
00317
00318 "movl %5, %%eax \n\t"
00319 "2: \n\t"
00320 "movl (%0), %%ebx \n\t"
00321 "movl (%0), %%ebx \n\t"
00322 "movl (%0), %%ebx \n\t"
00323 "movl (%0), %%ebx \n\t"
00324 "decl %%eax \n\t"
00325 " jnz 2b \n\t"
00326 #endif
00327
00328 "xorl %%eax, %%eax \n\t"
00329 "addl %3, %0 \n\t"
00330 "addl %3, %1 \n\t"
00331 "subl %4, %2 \n\t"
00332 "cmpl %4, %2 \n\t"
00333 " jae 1b \n\t"
00334 : "+r" (from), "+r" (to), "+r" (i)
00335 : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
00336 : "%eax", "%ebx"
00337 );
00338 #endif
00339
00340 for(; i>0; i--)
00341 {
00342 __asm__ __volatile__ (
00343 #ifndef HAVE_MMX1
00344 PREFETCH" 320(%0)\n"
00345 #endif
00346 "movq (%0), %%mm0\n"
00347 "movq 8(%0), %%mm1\n"
00348 "movq 16(%0), %%mm2\n"
00349 "movq 24(%0), %%mm3\n"
00350 "movq 32(%0), %%mm4\n"
00351 "movq 40(%0), %%mm5\n"
00352 "movq 48(%0), %%mm6\n"
00353 "movq 56(%0), %%mm7\n"
00354 MOVNTQ" %%mm0, (%1)\n"
00355 MOVNTQ" %%mm1, 8(%1)\n"
00356 MOVNTQ" %%mm2, 16(%1)\n"
00357 MOVNTQ" %%mm3, 24(%1)\n"
00358 MOVNTQ" %%mm4, 32(%1)\n"
00359 MOVNTQ" %%mm5, 40(%1)\n"
00360 MOVNTQ" %%mm6, 48(%1)\n"
00361 MOVNTQ" %%mm7, 56(%1)\n"
00362 :: "r" (from), "r" (to) : "memory");
00363 from = (const void *) (((const unsigned char *)from)+64);
00364 to = (void *) (((unsigned char *)to)+64);
00365 }
00366
00367 #endif
00368 #ifdef HAVE_MMX2
00369
00370
00371 __asm__ __volatile__ ("sfence":::"memory");
00372 #endif
00373 #ifndef HAVE_SSE
00374
00375 __asm__ __volatile__ (EMMS:::"memory");
00376 #endif
00377 }
00378
00379
00380
00381 if(len) small_memcpy(to, from, len);
00382 return retval;
00383 }
00384
00385
00386 #endif