00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
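/*
 * RGB<->RGB and YUV<->YUV conversion routines: plain C loops with optional
 * MMX / MMX2 / 3DNow! / SSE inline-asm fast paths selected at compile time
 * through the HAVE_* macros. RENAME() allows this file to be compiled once
 * per CPU-capability variant.
 */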
00011 #include <stddef.h>
00012 #include <inttypes.h>
00013
00014 #ifndef __WORDSIZE
00015
00016 #define __WORDSIZE MP_WORDSIZE
00017 #endif
00018
00019 #undef PREFETCH
00020 #undef MOVNTQ
00021 #undef EMMS
00022 #undef SFENCE
00023 #undef MMREG_SIZE
00024 #undef PREFETCHW
00025 #undef PAVGB
00026
00027 #ifdef HAVE_SSE2
00028 #define MMREG_SIZE 16
00029 #else
00030 #define MMREG_SIZE 8
00031 #endif
00032
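/* Choose the prefetch/average instructions for the selected instruction set;
   "/nop" is a placeholder used when no prefetch instruction is available. */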
00033 #ifdef HAVE_3DNOW
00034 #define PREFETCH "prefetch"
00035 #define PREFETCHW "prefetchw"
00036 #define PAVGB "pavgusb"
00037 #elif defined ( HAVE_MMX2 )
00038 #define PREFETCH "prefetchnta"
00039 #define PREFETCHW "prefetcht0"
00040 #define PAVGB "pavgb"
00041 #else
00042 #define PREFETCH "/nop"
00043 #define PREFETCHW "/nop"
00044 #endif
00045
00046 #ifdef HAVE_3DNOW
00047
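/* femms is the 3DNow! form of emms and is cheaper on AMD CPUs. */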
00048 #define EMMS "femms"
00049 #else
00050 #define EMMS "emms"
00051 #endif
00052
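/* MMX2 provides the non-temporal store movntq, which must be ordered with
   sfence once a block of stores is complete. */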
00053 #ifdef HAVE_MMX2
00054 #define MOVNTQ "movntq"
00055 #define SFENCE "sfence"
00056 #else
00057 #define MOVNTQ "movq"
00058 #define SFENCE "/nop"
00059 #endif
00060
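/* 24 bit -> 32 bit: append a zero byte to every 3-byte pixel. The MMX loop
   converts 8 pixels (24 source bytes -> 32 destination bytes) per iteration;
   the scalar tail writes the zero byte explicitly. */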
00061 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
00062 {
00063 uint8_t *dest = dst;
00064 const uint8_t *s = src;
00065 const uint8_t *end;
00066 #ifdef HAVE_MMX
00067 const uint8_t *mm_end;
00068 #endif
00069 end = s + src_size;
00070 #ifdef HAVE_MMX
00071 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
00072 mm_end = end - 23;
00073 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
00074 while(s < mm_end)
00075 {
00076 __asm __volatile(
00077 PREFETCH" 32%1\n\t"
00078 "movd %1, %%mm0\n\t"
00079 "punpckldq 3%1, %%mm0\n\t"
00080 "movd 6%1, %%mm1\n\t"
00081 "punpckldq 9%1, %%mm1\n\t"
00082 "movd 12%1, %%mm2\n\t"
00083 "punpckldq 15%1, %%mm2\n\t"
00084 "movd 18%1, %%mm3\n\t"
00085 "punpckldq 21%1, %%mm3\n\t"
00086 "pand %%mm7, %%mm0\n\t"
00087 "pand %%mm7, %%mm1\n\t"
00088 "pand %%mm7, %%mm2\n\t"
00089 "pand %%mm7, %%mm3\n\t"
00090 MOVNTQ" %%mm0, %0\n\t"
00091 MOVNTQ" %%mm1, 8%0\n\t"
00092 MOVNTQ" %%mm2, 16%0\n\t"
00093 MOVNTQ" %%mm3, 24%0"
00094 :"=m"(*dest)
00095 :"m"(*s)
00096 :"memory");
00097 dest += 32;
00098 s += 24;
00099 }
00100 __asm __volatile(SFENCE:::"memory");
00101 __asm __volatile(EMMS:::"memory");
00102 #endif
00103 while(s < end)
00104 {
00105 *dest++ = *s++;
00106 *dest++ = *s++;
00107 *dest++ = *s++;
00108 *dest++ = 0;
00109 }
00110 }
00111
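/* 32 bit -> 24 bit: drop the 4th byte of every pixel. The MMX loop repacks
   8 pixels (32 -> 24 bytes) per iteration using the mask24* constants. */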
00112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
00113 {
00114 uint8_t *dest = dst;
00115 const uint8_t *s = src;
00116 const uint8_t *end;
00117 #ifdef HAVE_MMX
00118 const uint8_t *mm_end;
00119 #endif
00120 end = s + src_size;
00121 #ifdef HAVE_MMX
00122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
00123 mm_end = end - 31;
00124 while(s < mm_end)
00125 {
00126 __asm __volatile(
00127 PREFETCH" 32%1\n\t"
00128 "movq %1, %%mm0\n\t"
00129 "movq 8%1, %%mm1\n\t"
00130 "movq 16%1, %%mm4\n\t"
00131 "movq 24%1, %%mm5\n\t"
00132 "movq %%mm0, %%mm2\n\t"
00133 "movq %%mm1, %%mm3\n\t"
00134 "movq %%mm4, %%mm6\n\t"
00135 "movq %%mm5, %%mm7\n\t"
00136 "psrlq $8, %%mm2\n\t"
00137 "psrlq $8, %%mm3\n\t"
00138 "psrlq $8, %%mm6\n\t"
00139 "psrlq $8, %%mm7\n\t"
00140 "pand %2, %%mm0\n\t"
00141 "pand %2, %%mm1\n\t"
00142 "pand %2, %%mm4\n\t"
00143 "pand %2, %%mm5\n\t"
00144 "pand %3, %%mm2\n\t"
00145 "pand %3, %%mm3\n\t"
00146 "pand %3, %%mm6\n\t"
00147 "pand %3, %%mm7\n\t"
00148 "por %%mm2, %%mm0\n\t"
00149 "por %%mm3, %%mm1\n\t"
00150 "por %%mm6, %%mm4\n\t"
00151 "por %%mm7, %%mm5\n\t"
00152
00153 "movq %%mm1, %%mm2\n\t"
00154 "movq %%mm4, %%mm3\n\t"
00155 "psllq $48, %%mm2\n\t"
00156 "psllq $32, %%mm3\n\t"
00157 "pand %4, %%mm2\n\t"
00158 "pand %5, %%mm3\n\t"
00159 "por %%mm2, %%mm0\n\t"
00160 "psrlq $16, %%mm1\n\t"
00161 "psrlq $32, %%mm4\n\t"
00162 "psllq $16, %%mm5\n\t"
00163 "por %%mm3, %%mm1\n\t"
00164 "pand %6, %%mm5\n\t"
00165 "por %%mm5, %%mm4\n\t"
00166
00167 MOVNTQ" %%mm0, %0\n\t"
00168 MOVNTQ" %%mm1, 8%0\n\t"
00169 MOVNTQ" %%mm4, 16%0"
00170 :"=m"(*dest)
00171 :"m"(*s),"m"(mask24l),
00172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
00173 :"memory");
00174 dest += 24;
00175 s += 32;
00176 }
00177 __asm __volatile(SFENCE:::"memory");
00178 __asm __volatile(EMMS:::"memory");
00179 #endif
00180 while(s < end)
00181 {
00182 *dest++ = *s++;
00183 *dest++ = *s++;
00184 *dest++ = *s++;
00185 s++;
00186 }
00187 }
00188
00189
00190
00191
00192
00193
00194
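/* RGB555 -> RGB565: shift the green and red fields up one bit, so the 5-bit
   green lands in the top five bits of the 6-bit 565 green field; done as
   x + (x & 0x7FE07FE0). */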
00195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
00196 {
00197 register const uint8_t* s=src;
00198 register uint8_t* d=dst;
00199 register const uint8_t *end;
00200 const uint8_t *mm_end;
00201 end = s + src_size;
00202 #ifdef HAVE_MMX
00203 __asm __volatile(PREFETCH" %0"::"m"(*s));
00204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
00205 mm_end = end - 15;
00206 while(s<mm_end)
00207 {
00208 __asm __volatile(
00209 PREFETCH" 32%1\n\t"
00210 "movq %1, %%mm0\n\t"
00211 "movq 8%1, %%mm2\n\t"
00212 "movq %%mm0, %%mm1\n\t"
00213 "movq %%mm2, %%mm3\n\t"
00214 "pand %%mm4, %%mm0\n\t"
00215 "pand %%mm4, %%mm2\n\t"
00216 "paddw %%mm1, %%mm0\n\t"
00217 "paddw %%mm3, %%mm2\n\t"
00218 MOVNTQ" %%mm0, %0\n\t"
00219 MOVNTQ" %%mm2, 8%0"
00220 :"=m"(*d)
00221 :"m"(*s)
00222 );
00223 d+=16;
00224 s+=16;
00225 }
00226 __asm __volatile(SFENCE:::"memory");
00227 __asm __volatile(EMMS:::"memory");
00228 #endif
00229 mm_end = end - 3;
00230 while(s < mm_end)
00231 {
00232 register unsigned x= *((uint32_t *)s);
00233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
00234 d+=4;
00235 s+=4;
00236 }
00237 if(s < end)
00238 {
00239 register unsigned short x= *((uint16_t *)s);
00240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
00241 }
00242 }
00243
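/* RGB565 -> RGB555: shift green and red down one bit, keep blue:
   ((x>>1)&0x7FE0) | (x&0x001F). */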
00244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
00245 {
00246 register const uint8_t* s=src;
00247 register uint8_t* d=dst;
00248 register const uint8_t *end;
00249 const uint8_t *mm_end;
00250 end = s + src_size;
00251 #ifdef HAVE_MMX
00252 __asm __volatile(PREFETCH" %0"::"m"(*s));
00253 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
00254 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
00255 mm_end = end - 15;
00256 while(s<mm_end)
00257 {
00258 __asm __volatile(
00259 PREFETCH" 32%1\n\t"
00260 "movq %1, %%mm0\n\t"
00261 "movq 8%1, %%mm2\n\t"
00262 "movq %%mm0, %%mm1\n\t"
00263 "movq %%mm2, %%mm3\n\t"
00264 "psrlq $1, %%mm0\n\t"
00265 "psrlq $1, %%mm2\n\t"
00266 "pand %%mm7, %%mm0\n\t"
00267 "pand %%mm7, %%mm2\n\t"
00268 "pand %%mm6, %%mm1\n\t"
00269 "pand %%mm6, %%mm3\n\t"
00270 "por %%mm1, %%mm0\n\t"
00271 "por %%mm3, %%mm2\n\t"
00272 MOVNTQ" %%mm0, %0\n\t"
00273 MOVNTQ" %%mm2, 8%0"
00274 :"=m"(*d)
00275 :"m"(*s)
00276 );
00277 d+=16;
00278 s+=16;
00279 }
00280 __asm __volatile(SFENCE:::"memory");
00281 __asm __volatile(EMMS:::"memory");
00282 #endif
00283 mm_end = end - 3;
00284 while(s < mm_end)
00285 {
00286 register uint32_t x= *((uint32_t *)s);
00287 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
00288 s+=4;
00289 d+=4;
00290 }
00291 if(s < end)
00292 {
00293 register uint16_t x= *((uint16_t *)s);
00294 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
00295 s+=2;
00296 d+=2;
00297 }
00298 }
00299
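/* 32 bit -> RGB565. The default MMX path packs four pixels per iteration
   using pmaddwd against mul3216; the disabled #else path uses plain shifts
   and masks. */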
00300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00301 {
00302 const uint8_t *s = src;
00303 const uint8_t *end;
00304 #ifdef HAVE_MMX
00305 const uint8_t *mm_end;
00306 #endif
00307 uint16_t *d = (uint16_t *)dst;
00308 end = s + src_size;
00309 #ifdef HAVE_MMX
00310 mm_end = end - 15;
00311 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
00312 asm volatile(
00313 "movq %3, %%mm5 \n\t"
00314 "movq %4, %%mm6 \n\t"
00315 "movq %5, %%mm7 \n\t"
00316 ".balign 16 \n\t"
00317 "1: \n\t"
00318 PREFETCH" 32(%1) \n\t"
00319 "movd (%1), %%mm0 \n\t"
00320 "movd 4(%1), %%mm3 \n\t"
00321 "punpckldq 8(%1), %%mm0 \n\t"
00322 "punpckldq 12(%1), %%mm3 \n\t"
00323 "movq %%mm0, %%mm1 \n\t"
00324 "movq %%mm3, %%mm4 \n\t"
00325 "pand %%mm6, %%mm0 \n\t"
00326 "pand %%mm6, %%mm3 \n\t"
00327 "pmaddwd %%mm7, %%mm0 \n\t"
00328 "pmaddwd %%mm7, %%mm3 \n\t"
00329 "pand %%mm5, %%mm1 \n\t"
00330 "pand %%mm5, %%mm4 \n\t"
00331 "por %%mm1, %%mm0 \n\t"
00332 "por %%mm4, %%mm3 \n\t"
00333 "psrld $5, %%mm0 \n\t"
00334 "pslld $11, %%mm3 \n\t"
00335 "por %%mm3, %%mm0 \n\t"
00336 MOVNTQ" %%mm0, (%0) \n\t"
00337 "addl $16, %1 \n\t"
00338 "addl $8, %0 \n\t"
00339 "cmpl %2, %1 \n\t"
00340 " jb 1b \n\t"
00341 : "+r" (d), "+r"(s)
00342 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
00343 );
00344 #else
00345 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00346 __asm __volatile(
00347 "movq %0, %%mm7\n\t"
00348 "movq %1, %%mm6\n\t"
00349 ::"m"(red_16mask),"m"(green_16mask));
00350 while(s < mm_end)
00351 {
00352 __asm __volatile(
00353 PREFETCH" 32%1\n\t"
00354 "movd %1, %%mm0\n\t"
00355 "movd 4%1, %%mm3\n\t"
00356 "punpckldq 8%1, %%mm0\n\t"
00357 "punpckldq 12%1, %%mm3\n\t"
00358 "movq %%mm0, %%mm1\n\t"
00359 "movq %%mm0, %%mm2\n\t"
00360 "movq %%mm3, %%mm4\n\t"
00361 "movq %%mm3, %%mm5\n\t"
00362 "psrlq $3, %%mm0\n\t"
00363 "psrlq $3, %%mm3\n\t"
00364 "pand %2, %%mm0\n\t"
00365 "pand %2, %%mm3\n\t"
00366 "psrlq $5, %%mm1\n\t"
00367 "psrlq $5, %%mm4\n\t"
00368 "pand %%mm6, %%mm1\n\t"
00369 "pand %%mm6, %%mm4\n\t"
00370 "psrlq $8, %%mm2\n\t"
00371 "psrlq $8, %%mm5\n\t"
00372 "pand %%mm7, %%mm2\n\t"
00373 "pand %%mm7, %%mm5\n\t"
00374 "por %%mm1, %%mm0\n\t"
00375 "por %%mm4, %%mm3\n\t"
00376 "por %%mm2, %%mm0\n\t"
00377 "por %%mm5, %%mm3\n\t"
00378 "psllq $16, %%mm3\n\t"
00379 "por %%mm3, %%mm0\n\t"
00380 MOVNTQ" %%mm0, %0\n\t"
00381 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00382 d += 4;
00383 s += 16;
00384 }
00385 #endif
00386 __asm __volatile(SFENCE:::"memory");
00387 __asm __volatile(EMMS:::"memory");
00388 #endif
00389 while(s < end)
00390 {
00391 const int src= *s; s += 4;
00392 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
00393
00394 }
00395 }
00396
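/* 32 bit -> 16 bit (565) with red and blue swapped relative to rgb32to16. */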
00397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
00398 {
00399 const uint8_t *s = src;
00400 const uint8_t *end;
00401 #ifdef HAVE_MMX
00402 const uint8_t *mm_end;
00403 #endif
00404 uint16_t *d = (uint16_t *)dst;
00405 end = s + src_size;
00406 #ifdef HAVE_MMX
00407 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00408 __asm __volatile(
00409 "movq %0, %%mm7\n\t"
00410 "movq %1, %%mm6\n\t"
00411 ::"m"(red_16mask),"m"(green_16mask));
00412 mm_end = end - 15;
00413 while(s < mm_end)
00414 {
00415 __asm __volatile(
00416 PREFETCH" 32%1\n\t"
00417 "movd %1, %%mm0\n\t"
00418 "movd 4%1, %%mm3\n\t"
00419 "punpckldq 8%1, %%mm0\n\t"
00420 "punpckldq 12%1, %%mm3\n\t"
00421 "movq %%mm0, %%mm1\n\t"
00422 "movq %%mm0, %%mm2\n\t"
00423 "movq %%mm3, %%mm4\n\t"
00424 "movq %%mm3, %%mm5\n\t"
00425 "psllq $8, %%mm0\n\t"
00426 "psllq $8, %%mm3\n\t"
00427 "pand %%mm7, %%mm0\n\t"
00428 "pand %%mm7, %%mm3\n\t"
00429 "psrlq $5, %%mm1\n\t"
00430 "psrlq $5, %%mm4\n\t"
00431 "pand %%mm6, %%mm1\n\t"
00432 "pand %%mm6, %%mm4\n\t"
00433 "psrlq $19, %%mm2\n\t"
00434 "psrlq $19, %%mm5\n\t"
00435 "pand %2, %%mm2\n\t"
00436 "pand %2, %%mm5\n\t"
00437 "por %%mm1, %%mm0\n\t"
00438 "por %%mm4, %%mm3\n\t"
00439 "por %%mm2, %%mm0\n\t"
00440 "por %%mm5, %%mm3\n\t"
00441 "psllq $16, %%mm3\n\t"
00442 "por %%mm3, %%mm0\n\t"
00443 MOVNTQ" %%mm0, %0\n\t"
00444 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00445 d += 4;
00446 s += 16;
00447 }
00448 __asm __volatile(SFENCE:::"memory");
00449 __asm __volatile(EMMS:::"memory");
00450 #endif
00451 while(s < end)
00452 {
00453 const int src= *s; s += 4;
00454 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
00455 }
00456 }
00457
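/* 32 bit -> RGB555; same structure as rgb32to16 but with a 5-bit green field
   (mask3215g / mul3215). */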
00458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00459 {
00460 const uint8_t *s = src;
00461 const uint8_t *end;
00462 #ifdef HAVE_MMX
00463 const uint8_t *mm_end;
00464 #endif
00465 uint16_t *d = (uint16_t *)dst;
00466 end = s + src_size;
00467 #ifdef HAVE_MMX
00468 mm_end = end - 15;
00469 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
00470 asm volatile(
00471 "movq %3, %%mm5 \n\t"
00472 "movq %4, %%mm6 \n\t"
00473 "movq %5, %%mm7 \n\t"
00474 ".balign 16 \n\t"
00475 "1: \n\t"
00476 PREFETCH" 32(%1) \n\t"
00477 "movd (%1), %%mm0 \n\t"
00478 "movd 4(%1), %%mm3 \n\t"
00479 "punpckldq 8(%1), %%mm0 \n\t"
00480 "punpckldq 12(%1), %%mm3 \n\t"
00481 "movq %%mm0, %%mm1 \n\t"
00482 "movq %%mm3, %%mm4 \n\t"
00483 "pand %%mm6, %%mm0 \n\t"
00484 "pand %%mm6, %%mm3 \n\t"
00485 "pmaddwd %%mm7, %%mm0 \n\t"
00486 "pmaddwd %%mm7, %%mm3 \n\t"
00487 "pand %%mm5, %%mm1 \n\t"
00488 "pand %%mm5, %%mm4 \n\t"
00489 "por %%mm1, %%mm0 \n\t"
00490 "por %%mm4, %%mm3 \n\t"
00491 "psrld $6, %%mm0 \n\t"
00492 "pslld $10, %%mm3 \n\t"
00493 "por %%mm3, %%mm0 \n\t"
00494 MOVNTQ" %%mm0, (%0) \n\t"
00495 "addl $16, %1 \n\t"
00496 "addl $8, %0 \n\t"
00497 "cmpl %2, %1 \n\t"
00498 " jb 1b \n\t"
00499 : "+r" (d), "+r"(s)
00500 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
00501 );
00502 #else
00503 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00504 __asm __volatile(
00505 "movq %0, %%mm7\n\t"
00506 "movq %1, %%mm6\n\t"
00507 ::"m"(red_15mask),"m"(green_15mask));
00508 while(s < mm_end)
00509 {
00510 __asm __volatile(
00511 PREFETCH" 32%1\n\t"
00512 "movd %1, %%mm0\n\t"
00513 "movd 4%1, %%mm3\n\t"
00514 "punpckldq 8%1, %%mm0\n\t"
00515 "punpckldq 12%1, %%mm3\n\t"
00516 "movq %%mm0, %%mm1\n\t"
00517 "movq %%mm0, %%mm2\n\t"
00518 "movq %%mm3, %%mm4\n\t"
00519 "movq %%mm3, %%mm5\n\t"
00520 "psrlq $3, %%mm0\n\t"
00521 "psrlq $3, %%mm3\n\t"
00522 "pand %2, %%mm0\n\t"
00523 "pand %2, %%mm3\n\t"
00524 "psrlq $6, %%mm1\n\t"
00525 "psrlq $6, %%mm4\n\t"
00526 "pand %%mm6, %%mm1\n\t"
00527 "pand %%mm6, %%mm4\n\t"
00528 "psrlq $9, %%mm2\n\t"
00529 "psrlq $9, %%mm5\n\t"
00530 "pand %%mm7, %%mm2\n\t"
00531 "pand %%mm7, %%mm5\n\t"
00532 "por %%mm1, %%mm0\n\t"
00533 "por %%mm4, %%mm3\n\t"
00534 "por %%mm2, %%mm0\n\t"
00535 "por %%mm5, %%mm3\n\t"
00536 "psllq $16, %%mm3\n\t"
00537 "por %%mm3, %%mm0\n\t"
00538 MOVNTQ" %%mm0, %0\n\t"
00539 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00540 d += 4;
00541 s += 16;
00542 }
00543 #endif
00544 __asm __volatile(SFENCE:::"memory");
00545 __asm __volatile(EMMS:::"memory");
00546 #endif
00547 while(s < end)
00548 {
00549 const int src= *s; s += 4;
00550 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
00551 }
00552 }
00553
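/* 32 bit -> 15 bit (555) with red and blue swapped relative to rgb32to15. */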
00554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00555 {
00556 const uint8_t *s = src;
00557 const uint8_t *end;
00558 #ifdef HAVE_MMX
00559 const uint8_t *mm_end;
00560 #endif
00561 uint16_t *d = (uint16_t *)dst;
00562 end = s + src_size;
00563 #ifdef HAVE_MMX
00564 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00565 __asm __volatile(
00566 "movq %0, %%mm7\n\t"
00567 "movq %1, %%mm6\n\t"
00568 ::"m"(red_15mask),"m"(green_15mask));
00569 mm_end = end - 15;
00570 while(s < mm_end)
00571 {
00572 __asm __volatile(
00573 PREFETCH" 32%1\n\t"
00574 "movd %1, %%mm0\n\t"
00575 "movd 4%1, %%mm3\n\t"
00576 "punpckldq 8%1, %%mm0\n\t"
00577 "punpckldq 12%1, %%mm3\n\t"
00578 "movq %%mm0, %%mm1\n\t"
00579 "movq %%mm0, %%mm2\n\t"
00580 "movq %%mm3, %%mm4\n\t"
00581 "movq %%mm3, %%mm5\n\t"
00582 "psllq $7, %%mm0\n\t"
00583 "psllq $7, %%mm3\n\t"
00584 "pand %%mm7, %%mm0\n\t"
00585 "pand %%mm7, %%mm3\n\t"
00586 "psrlq $6, %%mm1\n\t"
00587 "psrlq $6, %%mm4\n\t"
00588 "pand %%mm6, %%mm1\n\t"
00589 "pand %%mm6, %%mm4\n\t"
00590 "psrlq $19, %%mm2\n\t"
00591 "psrlq $19, %%mm5\n\t"
00592 "pand %2, %%mm2\n\t"
00593 "pand %2, %%mm5\n\t"
00594 "por %%mm1, %%mm0\n\t"
00595 "por %%mm4, %%mm3\n\t"
00596 "por %%mm2, %%mm0\n\t"
00597 "por %%mm5, %%mm3\n\t"
00598 "psllq $16, %%mm3\n\t"
00599 "por %%mm3, %%mm0\n\t"
00600 MOVNTQ" %%mm0, %0\n\t"
00601 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00602 d += 4;
00603 s += 16;
00604 }
00605 __asm __volatile(SFENCE:::"memory");
00606 __asm __volatile(EMMS:::"memory");
00607 #endif
00608 while(s < end)
00609 {
00610 const int src= *s; s += 4;
00611 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
00612 }
00613 }
00614
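/* 24 bit -> RGB565; the MMX loop loads 4 pixels (12 bytes) per iteration via
   movd/punpckldq. */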
00615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00616 {
00617 const uint8_t *s = src;
00618 const uint8_t *end;
00619 #ifdef HAVE_MMX
00620 const uint8_t *mm_end;
00621 #endif
00622 uint16_t *d = (uint16_t *)dst;
00623 end = s + src_size;
00624 #ifdef HAVE_MMX
00625 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00626 __asm __volatile(
00627 "movq %0, %%mm7\n\t"
00628 "movq %1, %%mm6\n\t"
00629 ::"m"(red_16mask),"m"(green_16mask));
00630 mm_end = end - 11;
00631 while(s < mm_end)
00632 {
00633 __asm __volatile(
00634 PREFETCH" 32%1\n\t"
00635 "movd %1, %%mm0\n\t"
00636 "movd 3%1, %%mm3\n\t"
00637 "punpckldq 6%1, %%mm0\n\t"
00638 "punpckldq 9%1, %%mm3\n\t"
00639 "movq %%mm0, %%mm1\n\t"
00640 "movq %%mm0, %%mm2\n\t"
00641 "movq %%mm3, %%mm4\n\t"
00642 "movq %%mm3, %%mm5\n\t"
00643 "psrlq $3, %%mm0\n\t"
00644 "psrlq $3, %%mm3\n\t"
00645 "pand %2, %%mm0\n\t"
00646 "pand %2, %%mm3\n\t"
00647 "psrlq $5, %%mm1\n\t"
00648 "psrlq $5, %%mm4\n\t"
00649 "pand %%mm6, %%mm1\n\t"
00650 "pand %%mm6, %%mm4\n\t"
00651 "psrlq $8, %%mm2\n\t"
00652 "psrlq $8, %%mm5\n\t"
00653 "pand %%mm7, %%mm2\n\t"
00654 "pand %%mm7, %%mm5\n\t"
00655 "por %%mm1, %%mm0\n\t"
00656 "por %%mm4, %%mm3\n\t"
00657 "por %%mm2, %%mm0\n\t"
00658 "por %%mm5, %%mm3\n\t"
00659 "psllq $16, %%mm3\n\t"
00660 "por %%mm3, %%mm0\n\t"
00661 MOVNTQ" %%mm0, %0\n\t"
00662 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00663 d += 4;
00664 s += 12;
00665 }
00666 __asm __volatile(SFENCE:::"memory");
00667 __asm __volatile(EMMS:::"memory");
00668 #endif
00669 while(s < end)
00670 {
00671 const int b= *s++;
00672 const int g= *s++;
00673 const int r= *s++;
00674 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00675 }
00676 }
00677
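/* 24 bit -> 16 bit (565) with red and blue swapped relative to rgb24to16. */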
00678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
00679 {
00680 const uint8_t *s = src;
00681 const uint8_t *end;
00682 #ifdef HAVE_MMX
00683 const uint8_t *mm_end;
00684 #endif
00685 uint16_t *d = (uint16_t *)dst;
00686 end = s + src_size;
00687 #ifdef HAVE_MMX
00688 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00689 __asm __volatile(
00690 "movq %0, %%mm7\n\t"
00691 "movq %1, %%mm6\n\t"
00692 ::"m"(red_16mask),"m"(green_16mask));
00693 mm_end = end - 15;
00694 while(s < mm_end)
00695 {
00696 __asm __volatile(
00697 PREFETCH" 32%1\n\t"
00698 "movd %1, %%mm0\n\t"
00699 "movd 3%1, %%mm3\n\t"
00700 "punpckldq 6%1, %%mm0\n\t"
00701 "punpckldq 9%1, %%mm3\n\t"
00702 "movq %%mm0, %%mm1\n\t"
00703 "movq %%mm0, %%mm2\n\t"
00704 "movq %%mm3, %%mm4\n\t"
00705 "movq %%mm3, %%mm5\n\t"
00706 "psllq $8, %%mm0\n\t"
00707 "psllq $8, %%mm3\n\t"
00708 "pand %%mm7, %%mm0\n\t"
00709 "pand %%mm7, %%mm3\n\t"
00710 "psrlq $5, %%mm1\n\t"
00711 "psrlq $5, %%mm4\n\t"
00712 "pand %%mm6, %%mm1\n\t"
00713 "pand %%mm6, %%mm4\n\t"
00714 "psrlq $19, %%mm2\n\t"
00715 "psrlq $19, %%mm5\n\t"
00716 "pand %2, %%mm2\n\t"
00717 "pand %2, %%mm5\n\t"
00718 "por %%mm1, %%mm0\n\t"
00719 "por %%mm4, %%mm3\n\t"
00720 "por %%mm2, %%mm0\n\t"
00721 "por %%mm5, %%mm3\n\t"
00722 "psllq $16, %%mm3\n\t"
00723 "por %%mm3, %%mm0\n\t"
00724 MOVNTQ" %%mm0, %0\n\t"
00725 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00726 d += 4;
00727 s += 12;
00728 }
00729 __asm __volatile(SFENCE:::"memory");
00730 __asm __volatile(EMMS:::"memory");
00731 #endif
00732 while(s < end)
00733 {
00734 const int r= *s++;
00735 const int g= *s++;
00736 const int b= *s++;
00737 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00738 }
00739 }
00740
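/* 24 bit -> RGB555. */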
00741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00742 {
00743 const uint8_t *s = src;
00744 const uint8_t *end;
00745 #ifdef HAVE_MMX
00746 const uint8_t *mm_end;
00747 #endif
00748 uint16_t *d = (uint16_t *)dst;
00749 end = s + src_size;
00750 #ifdef HAVE_MMX
00751 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00752 __asm __volatile(
00753 "movq %0, %%mm7\n\t"
00754 "movq %1, %%mm6\n\t"
00755 ::"m"(red_15mask),"m"(green_15mask));
00756 mm_end = end - 11;
00757 while(s < mm_end)
00758 {
00759 __asm __volatile(
00760 PREFETCH" 32%1\n\t"
00761 "movd %1, %%mm0\n\t"
00762 "movd 3%1, %%mm3\n\t"
00763 "punpckldq 6%1, %%mm0\n\t"
00764 "punpckldq 9%1, %%mm3\n\t"
00765 "movq %%mm0, %%mm1\n\t"
00766 "movq %%mm0, %%mm2\n\t"
00767 "movq %%mm3, %%mm4\n\t"
00768 "movq %%mm3, %%mm5\n\t"
00769 "psrlq $3, %%mm0\n\t"
00770 "psrlq $3, %%mm3\n\t"
00771 "pand %2, %%mm0\n\t"
00772 "pand %2, %%mm3\n\t"
00773 "psrlq $6, %%mm1\n\t"
00774 "psrlq $6, %%mm4\n\t"
00775 "pand %%mm6, %%mm1\n\t"
00776 "pand %%mm6, %%mm4\n\t"
00777 "psrlq $9, %%mm2\n\t"
00778 "psrlq $9, %%mm5\n\t"
00779 "pand %%mm7, %%mm2\n\t"
00780 "pand %%mm7, %%mm5\n\t"
00781 "por %%mm1, %%mm0\n\t"
00782 "por %%mm4, %%mm3\n\t"
00783 "por %%mm2, %%mm0\n\t"
00784 "por %%mm5, %%mm3\n\t"
00785 "psllq $16, %%mm3\n\t"
00786 "por %%mm3, %%mm0\n\t"
00787 MOVNTQ" %%mm0, %0\n\t"
00788 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00789 d += 4;
00790 s += 12;
00791 }
00792 __asm __volatile(SFENCE:::"memory");
00793 __asm __volatile(EMMS:::"memory");
00794 #endif
00795 while(s < end)
00796 {
00797 const int b= *s++;
00798 const int g= *s++;
00799 const int r= *s++;
00800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00801 }
00802 }
00803
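/* 24 bit -> 15 bit (555) with red and blue swapped relative to rgb24to15. */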
00804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00805 {
00806 const uint8_t *s = src;
00807 const uint8_t *end;
00808 #ifdef HAVE_MMX
00809 const uint8_t *mm_end;
00810 #endif
00811 uint16_t *d = (uint16_t *)dst;
00812 end = s + src_size;
00813 #ifdef HAVE_MMX
00814 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
00815 __asm __volatile(
00816 "movq %0, %%mm7\n\t"
00817 "movq %1, %%mm6\n\t"
00818 ::"m"(red_15mask),"m"(green_15mask));
00819 mm_end = end - 15;
00820 while(s < mm_end)
00821 {
00822 __asm __volatile(
00823 PREFETCH" 32%1\n\t"
00824 "movd %1, %%mm0\n\t"
00825 "movd 3%1, %%mm3\n\t"
00826 "punpckldq 6%1, %%mm0\n\t"
00827 "punpckldq 9%1, %%mm3\n\t"
00828 "movq %%mm0, %%mm1\n\t"
00829 "movq %%mm0, %%mm2\n\t"
00830 "movq %%mm3, %%mm4\n\t"
00831 "movq %%mm3, %%mm5\n\t"
00832 "psllq $7, %%mm0\n\t"
00833 "psllq $7, %%mm3\n\t"
00834 "pand %%mm7, %%mm0\n\t"
00835 "pand %%mm7, %%mm3\n\t"
00836 "psrlq $6, %%mm1\n\t"
00837 "psrlq $6, %%mm4\n\t"
00838 "pand %%mm6, %%mm1\n\t"
00839 "pand %%mm6, %%mm4\n\t"
00840 "psrlq $19, %%mm2\n\t"
00841 "psrlq $19, %%mm5\n\t"
00842 "pand %2, %%mm2\n\t"
00843 "pand %2, %%mm5\n\t"
00844 "por %%mm1, %%mm0\n\t"
00845 "por %%mm4, %%mm3\n\t"
00846 "por %%mm2, %%mm0\n\t"
00847 "por %%mm5, %%mm3\n\t"
00848 "psllq $16, %%mm3\n\t"
00849 "por %%mm3, %%mm0\n\t"
00850 MOVNTQ" %%mm0, %0\n\t"
00851 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00852 d += 4;
00853 s += 12;
00854 }
00855 __asm __volatile(SFENCE:::"memory");
00856 __asm __volatile(EMMS:::"memory");
00857 #endif
00858 while(s < end)
00859 {
00860 const int r= *s++;
00861 const int g= *s++;
00862 const int b= *s++;
00863 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00864 }
00865 }
00866
00867
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881
00882
00883
00884
00885
00886
00887
00888
00889
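/* RGB555 -> 24 bit: expand each 5-bit field to 8 bits (by shifting, no
   rounding) and repack 8 pixels from 16 to 24 bytes. The work is split across
   two asm statements that keep intermediate results in MMX registers between
   them. */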
00890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00891 {
00892 const uint16_t *end;
00893 #ifdef HAVE_MMX
00894 const uint16_t *mm_end;
00895 #endif
00896 uint8_t *d = (uint8_t *)dst;
00897 const uint16_t *s = (const uint16_t *)src;
00898 end = s + src_size/2;
00899 #ifdef HAVE_MMX
00900 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
00901 mm_end = end - 7;
00902 while(s < mm_end)
00903 {
00904 __asm __volatile(
00905 PREFETCH" 32%1\n\t"
00906 "movq %1, %%mm0\n\t"
00907 "movq %1, %%mm1\n\t"
00908 "movq %1, %%mm2\n\t"
00909 "pand %2, %%mm0\n\t"
00910 "pand %3, %%mm1\n\t"
00911 "pand %4, %%mm2\n\t"
00912 "psllq $3, %%mm0\n\t"
00913 "psrlq $2, %%mm1\n\t"
00914 "psrlq $7, %%mm2\n\t"
00915 "movq %%mm0, %%mm3\n\t"
00916 "movq %%mm1, %%mm4\n\t"
00917 "movq %%mm2, %%mm5\n\t"
00918 "punpcklwd %5, %%mm0\n\t"
00919 "punpcklwd %5, %%mm1\n\t"
00920 "punpcklwd %5, %%mm2\n\t"
00921 "punpckhwd %5, %%mm3\n\t"
00922 "punpckhwd %5, %%mm4\n\t"
00923 "punpckhwd %5, %%mm5\n\t"
00924 "psllq $8, %%mm1\n\t"
00925 "psllq $16, %%mm2\n\t"
00926 "por %%mm1, %%mm0\n\t"
00927 "por %%mm2, %%mm0\n\t"
00928 "psllq $8, %%mm4\n\t"
00929 "psllq $16, %%mm5\n\t"
00930 "por %%mm4, %%mm3\n\t"
00931 "por %%mm5, %%mm3\n\t"
00932
00933 "movq %%mm0, %%mm6\n\t"
00934 "movq %%mm3, %%mm7\n\t"
00935
00936 "movq 8%1, %%mm0\n\t"
00937 "movq 8%1, %%mm1\n\t"
00938 "movq 8%1, %%mm2\n\t"
00939 "pand %2, %%mm0\n\t"
00940 "pand %3, %%mm1\n\t"
00941 "pand %4, %%mm2\n\t"
00942 "psllq $3, %%mm0\n\t"
00943 "psrlq $2, %%mm1\n\t"
00944 "psrlq $7, %%mm2\n\t"
00945 "movq %%mm0, %%mm3\n\t"
00946 "movq %%mm1, %%mm4\n\t"
00947 "movq %%mm2, %%mm5\n\t"
00948 "punpcklwd %5, %%mm0\n\t"
00949 "punpcklwd %5, %%mm1\n\t"
00950 "punpcklwd %5, %%mm2\n\t"
00951 "punpckhwd %5, %%mm3\n\t"
00952 "punpckhwd %5, %%mm4\n\t"
00953 "punpckhwd %5, %%mm5\n\t"
00954 "psllq $8, %%mm1\n\t"
00955 "psllq $16, %%mm2\n\t"
00956 "por %%mm1, %%mm0\n\t"
00957 "por %%mm2, %%mm0\n\t"
00958 "psllq $8, %%mm4\n\t"
00959 "psllq $16, %%mm5\n\t"
00960 "por %%mm4, %%mm3\n\t"
00961 "por %%mm5, %%mm3\n\t"
00962
00963 :"=m"(*d)
00964 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
00965 :"memory");
00966
00967 __asm __volatile(
00968 "movq %%mm0, %%mm4\n\t"
00969 "movq %%mm3, %%mm5\n\t"
00970 "movq %%mm6, %%mm0\n\t"
00971 "movq %%mm7, %%mm1\n\t"
00972
00973 "movq %%mm4, %%mm6\n\t"
00974 "movq %%mm5, %%mm7\n\t"
00975 "movq %%mm0, %%mm2\n\t"
00976 "movq %%mm1, %%mm3\n\t"
00977
00978 "psrlq $8, %%mm2\n\t"
00979 "psrlq $8, %%mm3\n\t"
00980 "psrlq $8, %%mm6\n\t"
00981 "psrlq $8, %%mm7\n\t"
00982 "pand %2, %%mm0\n\t"
00983 "pand %2, %%mm1\n\t"
00984 "pand %2, %%mm4\n\t"
00985 "pand %2, %%mm5\n\t"
00986 "pand %3, %%mm2\n\t"
00987 "pand %3, %%mm3\n\t"
00988 "pand %3, %%mm6\n\t"
00989 "pand %3, %%mm7\n\t"
00990 "por %%mm2, %%mm0\n\t"
00991 "por %%mm3, %%mm1\n\t"
00992 "por %%mm6, %%mm4\n\t"
00993 "por %%mm7, %%mm5\n\t"
00994
00995 "movq %%mm1, %%mm2\n\t"
00996 "movq %%mm4, %%mm3\n\t"
00997 "psllq $48, %%mm2\n\t"
00998 "psllq $32, %%mm3\n\t"
00999 "pand %4, %%mm2\n\t"
01000 "pand %5, %%mm3\n\t"
01001 "por %%mm2, %%mm0\n\t"
01002 "psrlq $16, %%mm1\n\t"
01003 "psrlq $32, %%mm4\n\t"
01004 "psllq $16, %%mm5\n\t"
01005 "por %%mm3, %%mm1\n\t"
01006 "pand %6, %%mm5\n\t"
01007 "por %%mm5, %%mm4\n\t"
01008
01009 MOVNTQ" %%mm0, %0\n\t"
01010 MOVNTQ" %%mm1, 8%0\n\t"
01011 MOVNTQ" %%mm4, 16%0"
01012
01013 :"=m"(*d)
01014 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
01015 :"memory");
01016 d += 24;
01017 s += 8;
01018 }
01019 __asm __volatile(SFENCE:::"memory");
01020 __asm __volatile(EMMS:::"memory");
01021 #endif
01022 while(s < end)
01023 {
01024 register uint16_t bgr;
01025 bgr = *s++;
01026 *d++ = (bgr&0x1F)<<3;
01027 *d++ = (bgr&0x3E0)>>2;
01028 *d++ = (bgr&0x7C00)>>7;
01029 }
01030 }
01031
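/* RGB565 -> 24 bit: same scheme as rgb15to24, with a 6-bit green field. */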
01032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01033 {
01034 const uint16_t *end;
01035 #ifdef HAVE_MMX
01036 const uint16_t *mm_end;
01037 #endif
01038 uint8_t *d = (uint8_t *)dst;
01039 const uint16_t *s = (const uint16_t *)src;
01040 end = s + src_size/2;
01041 #ifdef HAVE_MMX
01042 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
01043 mm_end = end - 7;
01044 while(s < mm_end)
01045 {
01046 __asm __volatile(
01047 PREFETCH" 32%1\n\t"
01048 "movq %1, %%mm0\n\t"
01049 "movq %1, %%mm1\n\t"
01050 "movq %1, %%mm2\n\t"
01051 "pand %2, %%mm0\n\t"
01052 "pand %3, %%mm1\n\t"
01053 "pand %4, %%mm2\n\t"
01054 "psllq $3, %%mm0\n\t"
01055 "psrlq $3, %%mm1\n\t"
01056 "psrlq $8, %%mm2\n\t"
01057 "movq %%mm0, %%mm3\n\t"
01058 "movq %%mm1, %%mm4\n\t"
01059 "movq %%mm2, %%mm5\n\t"
01060 "punpcklwd %5, %%mm0\n\t"
01061 "punpcklwd %5, %%mm1\n\t"
01062 "punpcklwd %5, %%mm2\n\t"
01063 "punpckhwd %5, %%mm3\n\t"
01064 "punpckhwd %5, %%mm4\n\t"
01065 "punpckhwd %5, %%mm5\n\t"
01066 "psllq $8, %%mm1\n\t"
01067 "psllq $16, %%mm2\n\t"
01068 "por %%mm1, %%mm0\n\t"
01069 "por %%mm2, %%mm0\n\t"
01070 "psllq $8, %%mm4\n\t"
01071 "psllq $16, %%mm5\n\t"
01072 "por %%mm4, %%mm3\n\t"
01073 "por %%mm5, %%mm3\n\t"
01074
01075 "movq %%mm0, %%mm6\n\t"
01076 "movq %%mm3, %%mm7\n\t"
01077
01078 "movq 8%1, %%mm0\n\t"
01079 "movq 8%1, %%mm1\n\t"
01080 "movq 8%1, %%mm2\n\t"
01081 "pand %2, %%mm0\n\t"
01082 "pand %3, %%mm1\n\t"
01083 "pand %4, %%mm2\n\t"
01084 "psllq $3, %%mm0\n\t"
01085 "psrlq $3, %%mm1\n\t"
01086 "psrlq $8, %%mm2\n\t"
01087 "movq %%mm0, %%mm3\n\t"
01088 "movq %%mm1, %%mm4\n\t"
01089 "movq %%mm2, %%mm5\n\t"
01090 "punpcklwd %5, %%mm0\n\t"
01091 "punpcklwd %5, %%mm1\n\t"
01092 "punpcklwd %5, %%mm2\n\t"
01093 "punpckhwd %5, %%mm3\n\t"
01094 "punpckhwd %5, %%mm4\n\t"
01095 "punpckhwd %5, %%mm5\n\t"
01096 "psllq $8, %%mm1\n\t"
01097 "psllq $16, %%mm2\n\t"
01098 "por %%mm1, %%mm0\n\t"
01099 "por %%mm2, %%mm0\n\t"
01100 "psllq $8, %%mm4\n\t"
01101 "psllq $16, %%mm5\n\t"
01102 "por %%mm4, %%mm3\n\t"
01103 "por %%mm5, %%mm3\n\t"
01104 :"=m"(*d)
01105 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
01106 :"memory");
01107
01108 __asm __volatile(
01109 "movq %%mm0, %%mm4\n\t"
01110 "movq %%mm3, %%mm5\n\t"
01111 "movq %%mm6, %%mm0\n\t"
01112 "movq %%mm7, %%mm1\n\t"
01113
01114 "movq %%mm4, %%mm6\n\t"
01115 "movq %%mm5, %%mm7\n\t"
01116 "movq %%mm0, %%mm2\n\t"
01117 "movq %%mm1, %%mm3\n\t"
01118
01119 "psrlq $8, %%mm2\n\t"
01120 "psrlq $8, %%mm3\n\t"
01121 "psrlq $8, %%mm6\n\t"
01122 "psrlq $8, %%mm7\n\t"
01123 "pand %2, %%mm0\n\t"
01124 "pand %2, %%mm1\n\t"
01125 "pand %2, %%mm4\n\t"
01126 "pand %2, %%mm5\n\t"
01127 "pand %3, %%mm2\n\t"
01128 "pand %3, %%mm3\n\t"
01129 "pand %3, %%mm6\n\t"
01130 "pand %3, %%mm7\n\t"
01131 "por %%mm2, %%mm0\n\t"
01132 "por %%mm3, %%mm1\n\t"
01133 "por %%mm6, %%mm4\n\t"
01134 "por %%mm7, %%mm5\n\t"
01135
01136 "movq %%mm1, %%mm2\n\t"
01137 "movq %%mm4, %%mm3\n\t"
01138 "psllq $48, %%mm2\n\t"
01139 "psllq $32, %%mm3\n\t"
01140 "pand %4, %%mm2\n\t"
01141 "pand %5, %%mm3\n\t"
01142 "por %%mm2, %%mm0\n\t"
01143 "psrlq $16, %%mm1\n\t"
01144 "psrlq $32, %%mm4\n\t"
01145 "psllq $16, %%mm5\n\t"
01146 "por %%mm3, %%mm1\n\t"
01147 "pand %6, %%mm5\n\t"
01148 "por %%mm5, %%mm4\n\t"
01149
01150 MOVNTQ" %%mm0, %0\n\t"
01151 MOVNTQ" %%mm1, 8%0\n\t"
01152 MOVNTQ" %%mm4, 16%0"
01153
01154 :"=m"(*d)
01155 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
01156 :"memory");
01157 d += 24;
01158 s += 8;
01159 }
01160 __asm __volatile(SFENCE:::"memory");
01161 __asm __volatile(EMMS:::"memory");
01162 #endif
01163 while(s < end)
01164 {
01165 register uint16_t bgr;
01166 bgr = *s++;
01167 *d++ = (bgr&0x1F)<<3;
01168 *d++ = (bgr&0x7E0)>>3;
01169 *d++ = (bgr&0xF800)>>8;
01170 }
01171 }
01172
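/* RGB555 -> 32 bit: expand each field to 8 bits and append a zero byte. */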
01173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01174 {
01175 const uint16_t *end;
01176 #ifdef HAVE_MMX
01177 const uint16_t *mm_end;
01178 #endif
01179 uint8_t *d = (uint8_t *)dst;
01180 const uint16_t *s = (const uint16_t *)src;
01181 end = s + src_size/2;
01182 #ifdef HAVE_MMX
01183 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
01184 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
01185 mm_end = end - 3;
01186 while(s < mm_end)
01187 {
01188 __asm __volatile(
01189 PREFETCH" 32%1\n\t"
01190 "movq %1, %%mm0\n\t"
01191 "movq %1, %%mm1\n\t"
01192 "movq %1, %%mm2\n\t"
01193 "pand %2, %%mm0\n\t"
01194 "pand %3, %%mm1\n\t"
01195 "pand %4, %%mm2\n\t"
01196 "psllq $3, %%mm0\n\t"
01197 "psrlq $2, %%mm1\n\t"
01198 "psrlq $7, %%mm2\n\t"
01199 "movq %%mm0, %%mm3\n\t"
01200 "movq %%mm1, %%mm4\n\t"
01201 "movq %%mm2, %%mm5\n\t"
01202 "punpcklwd %%mm7, %%mm0\n\t"
01203 "punpcklwd %%mm7, %%mm1\n\t"
01204 "punpcklwd %%mm7, %%mm2\n\t"
01205 "punpckhwd %%mm7, %%mm3\n\t"
01206 "punpckhwd %%mm7, %%mm4\n\t"
01207 "punpckhwd %%mm7, %%mm5\n\t"
01208 "psllq $8, %%mm1\n\t"
01209 "psllq $16, %%mm2\n\t"
01210 "por %%mm1, %%mm0\n\t"
01211 "por %%mm2, %%mm0\n\t"
01212 "psllq $8, %%mm4\n\t"
01213 "psllq $16, %%mm5\n\t"
01214 "por %%mm4, %%mm3\n\t"
01215 "por %%mm5, %%mm3\n\t"
01216 MOVNTQ" %%mm0, %0\n\t"
01217 MOVNTQ" %%mm3, 8%0\n\t"
01218 :"=m"(*d)
01219 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
01220 :"memory");
01221 d += 16;
01222 s += 4;
01223 }
01224 __asm __volatile(SFENCE:::"memory");
01225 __asm __volatile(EMMS:::"memory");
01226 #endif
01227 while(s < end)
01228 {
01229 #if 0 // slightly slower on Athlon
01230 int bgr= *s++;
01231 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
01232 #else
01233
01234 register uint16_t bgr;
01235 bgr = *s++;
01236 *d++ = (bgr&0x1F)<<3;
01237 *d++ = (bgr&0x3E0)>>2;
01238 *d++ = (bgr&0x7C00)>>7;
01239 *d++ = 0;
01240 #endif
01241 }
01242 }
01243
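/* RGB565 -> 32 bit. */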
01244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01245 {
01246 const uint16_t *end;
01247 #ifdef HAVE_MMX
01248 const uint16_t *mm_end;
01249 #endif
01250 uint8_t *d = (uint8_t *)dst;
01251 const uint16_t *s = (const uint16_t *)src;
01252 end = s + src_size/2;
01253 #ifdef HAVE_MMX
01254 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
01255 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
01256 mm_end = end - 3;
01257 while(s < mm_end)
01258 {
01259 __asm __volatile(
01260 PREFETCH" 32%1\n\t"
01261 "movq %1, %%mm0\n\t"
01262 "movq %1, %%mm1\n\t"
01263 "movq %1, %%mm2\n\t"
01264 "pand %2, %%mm0\n\t"
01265 "pand %3, %%mm1\n\t"
01266 "pand %4, %%mm2\n\t"
01267 "psllq $3, %%mm0\n\t"
01268 "psrlq $3, %%mm1\n\t"
01269 "psrlq $8, %%mm2\n\t"
01270 "movq %%mm0, %%mm3\n\t"
01271 "movq %%mm1, %%mm4\n\t"
01272 "movq %%mm2, %%mm5\n\t"
01273 "punpcklwd %%mm7, %%mm0\n\t"
01274 "punpcklwd %%mm7, %%mm1\n\t"
01275 "punpcklwd %%mm7, %%mm2\n\t"
01276 "punpckhwd %%mm7, %%mm3\n\t"
01277 "punpckhwd %%mm7, %%mm4\n\t"
01278 "punpckhwd %%mm7, %%mm5\n\t"
01279 "psllq $8, %%mm1\n\t"
01280 "psllq $16, %%mm2\n\t"
01281 "por %%mm1, %%mm0\n\t"
01282 "por %%mm2, %%mm0\n\t"
01283 "psllq $8, %%mm4\n\t"
01284 "psllq $16, %%mm5\n\t"
01285 "por %%mm4, %%mm3\n\t"
01286 "por %%mm5, %%mm3\n\t"
01287 MOVNTQ" %%mm0, %0\n\t"
01288 MOVNTQ" %%mm3, 8%0\n\t"
01289 :"=m"(*d)
01290 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
01291 :"memory");
01292 d += 16;
01293 s += 4;
01294 }
01295 __asm __volatile(SFENCE:::"memory");
01296 __asm __volatile(EMMS:::"memory");
01297 #endif
01298 while(s < end)
01299 {
01300 register uint16_t bgr;
01301 bgr = *s++;
01302 *d++ = (bgr&0x1F)<<3;
01303 *d++ = (bgr&0x7E0)>>3;
01304 *d++ = (bgr&0xF800)>>8;
01305 *d++ = 0;
01306 }
01307 }
01308
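/* Swap bytes 0 and 2 (R<->B) of every 32-bit pixel. The C fallback leaves
   byte 3 untouched; the MMX path ors together the three masked colour
   channels, so byte 3 ends up zero assuming mask32r/g/b cover only the colour
   bytes. */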
01309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
01310 {
01311 #ifdef HAVE_MMX
01312
01313 asm volatile (
01314 "xorl %%eax, %%eax \n\t"
01315 ".balign 16 \n\t"
01316 "1: \n\t"
01317 PREFETCH" 32(%0, %%eax) \n\t"
01318 "movq (%0, %%eax), %%mm0 \n\t"
01319 "movq %%mm0, %%mm1 \n\t"
01320 "movq %%mm0, %%mm2 \n\t"
01321 "pslld $16, %%mm0 \n\t"
01322 "psrld $16, %%mm1 \n\t"
01323 "pand "MANGLE(mask32r)", %%mm0 \n\t"
01324 "pand "MANGLE(mask32g)", %%mm2 \n\t"
01325 "pand "MANGLE(mask32b)", %%mm1 \n\t"
01326 "por %%mm0, %%mm2 \n\t"
01327 "por %%mm1, %%mm2 \n\t"
01328 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
01329 "addl $8, %%eax \n\t"
01330 "cmpl %2, %%eax \n\t"
01331 " jb 1b \n\t"
01332 :: "r" (src), "r"(dst), "r" (src_size-7)
01333 : "%eax"
01334 );
01335
01336 __asm __volatile(SFENCE:::"memory");
01337 __asm __volatile(EMMS:::"memory");
01338 #else
01339 unsigned i;
01340 unsigned num_pixels = src_size >> 2;
01341 for(i=0; i<num_pixels; i++)
01342 {
01343 #ifdef WORDS_BIGENDIAN
01344 dst[4*i + 1] = src[4*i + 3];
01345 dst[4*i + 2] = src[4*i + 2];
01346 dst[4*i + 3] = src[4*i + 1];
01347 #else
01348 dst[4*i + 0] = src[4*i + 2];
01349 dst[4*i + 1] = src[4*i + 1];
01350 dst[4*i + 2] = src[4*i + 0];
01351 #endif
01352 }
01353 #endif
01354 }
01355
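/* Swap R and B in packed 24-bit pixels. The MMX loop counts upward from a
   negative offset (mmx_size = 23 - src_size) and leaves at most 23 trailing
   bytes for the scalar loop. */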
01356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
01357 {
01358 unsigned i;
01359 #ifdef HAVE_MMX
01360 int mmx_size= 23 - src_size;
01361 asm volatile (
01362 "movq "MANGLE(mask24r)", %%mm5 \n\t"
01363 "movq "MANGLE(mask24g)", %%mm6 \n\t"
01364 "movq "MANGLE(mask24b)", %%mm7 \n\t"
01365 ".balign 16 \n\t"
01366 "1: \n\t"
01367 PREFETCH" 32(%1, %%eax) \n\t"
01368 "movq (%1, %%eax), %%mm0 \n\t"
01369 "movq (%1, %%eax), %%mm1 \n\t"
01370 "movq 2(%1, %%eax), %%mm2 \n\t"
01371 "psllq $16, %%mm0 \n\t"
01372 "pand %%mm5, %%mm0 \n\t"
01373 "pand %%mm6, %%mm1 \n\t"
01374 "pand %%mm7, %%mm2 \n\t"
01375 "por %%mm0, %%mm1 \n\t"
01376 "por %%mm2, %%mm1 \n\t"
01377 "movq 6(%1, %%eax), %%mm0 \n\t"
01378 MOVNTQ" %%mm1, (%2, %%eax) \n\t"
01379 "movq 8(%1, %%eax), %%mm1 \n\t"
01380 "movq 10(%1, %%eax), %%mm2 \n\t"
01381 "pand %%mm7, %%mm0 \n\t"
01382 "pand %%mm5, %%mm1 \n\t"
01383 "pand %%mm6, %%mm2 \n\t"
01384 "por %%mm0, %%mm1 \n\t"
01385 "por %%mm2, %%mm1 \n\t"
01386 "movq 14(%1, %%eax), %%mm0 \n\t"
01387 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t"
01388 "movq 16(%1, %%eax), %%mm1 \n\t"
01389 "movq 18(%1, %%eax), %%mm2 \n\t"
01390 "pand %%mm6, %%mm0 \n\t"
01391 "pand %%mm7, %%mm1 \n\t"
01392 "pand %%mm5, %%mm2 \n\t"
01393 "por %%mm0, %%mm1 \n\t"
01394 "por %%mm2, %%mm1 \n\t"
01395 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
01396 "addl $24, %%eax \n\t"
01397 " js 1b \n\t"
01398 : "+a" (mmx_size)
01399 : "r" (src-mmx_size), "r"(dst-mmx_size)
01400 );
01401
01402 __asm __volatile(SFENCE:::"memory");
01403 __asm __volatile(EMMS:::"memory");
01404
01405 if(mmx_size==23) return;
01406
01407 src+= src_size;
01408 dst+= src_size;
01409 src_size= 23-mmx_size;
01410 src-= src_size;
01411 dst-= src_size;
01412 #endif
01413 for(i=0; i<src_size; i+=3)
01414 {
01415 register uint8_t x;
01416 x = src[i + 2];
01417 dst[i + 1] = src[i + 1];
01418 dst[i + 2] = src[i + 0];
01419 dst[i + 0] = x;
01420 }
01421 }
01422
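/* Interleave planar Y, U, V into packed YUY2 (Y U Y V). vertLumPerChroma
   selects 4:2:0 (2) or 4:2:2 (1) input; the chroma pointers only advance
   every vertLumPerChroma lines. The MMX loop emits 16 luma pixels (32 output
   bytes) per iteration. */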
01423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01424 unsigned int width, unsigned int height,
01425 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01426 {
01427 unsigned y;
01428 const unsigned chromWidth= width>>1;
01429 for(y=0; y<height; y++)
01430 {
01431 #ifdef HAVE_MMX
01432
01433 asm volatile(
01434 "xorl %%eax, %%eax \n\t"
01435 ".balign 16 \n\t"
01436 "1: \n\t"
01437 PREFETCH" 32(%1, %%eax, 2) \n\t"
01438 PREFETCH" 32(%2, %%eax) \n\t"
01439 PREFETCH" 32(%3, %%eax) \n\t"
01440 "movq (%2, %%eax), %%mm0 \n\t"
01441 "movq %%mm0, %%mm2 \n\t"
01442 "movq (%3, %%eax), %%mm1 \n\t"
01443 "punpcklbw %%mm1, %%mm0 \n\t"
01444 "punpckhbw %%mm1, %%mm2 \n\t"
01445
01446 "movq (%1, %%eax,2), %%mm3 \n\t"
01447 "movq 8(%1, %%eax,2), %%mm5 \n\t"
01448 "movq %%mm3, %%mm4 \n\t"
01449 "movq %%mm5, %%mm6 \n\t"
01450 "punpcklbw %%mm0, %%mm3 \n\t"
01451 "punpckhbw %%mm0, %%mm4 \n\t"
01452 "punpcklbw %%mm2, %%mm5 \n\t"
01453 "punpckhbw %%mm2, %%mm6 \n\t"
01454
01455 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
01456 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
01457 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
01458 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
01459
01460 "addl $8, %%eax \n\t"
01461 "cmpl %4, %%eax \n\t"
01462 " jb 1b \n\t"
01463 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01464 : "%eax"
01465 );
01466 #else
01467
01468 #if defined ARCH_ALPHA && defined HAVE_MVI
01469 #define pl2yuy2(n) \
01470 y1 = yc[n]; \
01471 y2 = yc2[n]; \
01472 u = uc[n]; \
01473 v = vc[n]; \
01474 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
01475 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
01476 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
01477 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
01478 yuv1 = (u << 8) + (v << 24); \
01479 yuv2 = yuv1 + y2; \
01480 yuv1 += y1; \
01481 qdst[n] = yuv1; \
01482 qdst2[n] = yuv2;
01483
01484 int i;
01485 uint64_t *qdst = (uint64_t *) dst;
01486 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
01487 const uint32_t *yc = (uint32_t *) ysrc;
01488 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
01489 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
01490 for(i = 0; i < chromWidth; i += 8){
01491 uint64_t y1, y2, yuv1, yuv2;
01492 uint64_t u, v;
01493
01494 asm("ldq $31,64(%0)" :: "r"(yc));
01495 asm("ldq $31,64(%0)" :: "r"(yc2));
01496 asm("ldq $31,64(%0)" :: "r"(uc));
01497 asm("ldq $31,64(%0)" :: "r"(vc));
01498
01499 pl2yuy2(0);
01500 pl2yuy2(1);
01501 pl2yuy2(2);
01502 pl2yuy2(3);
01503
01504 yc += 4;
01505 yc2 += 4;
01506 uc += 4;
01507 vc += 4;
01508 qdst += 4;
01509 qdst2 += 4;
01510 }
01511 y++;
01512 ysrc += lumStride;
01513 dst += dstStride;
01514
01515 #elif __WORDSIZE >= 64
01516 int i;
01517 uint64_t *ldst = (uint64_t *) dst;
01518 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01519 for(i = 0; i < chromWidth; i += 2){
01520 uint64_t k, l;
01521 k = yc[0] + (uc[0] << 8) +
01522 (yc[1] << 16) + (vc[0] << 24);
01523 l = yc[2] + (uc[1] << 8) +
01524 (yc[3] << 16) + (vc[1] << 24);
01525 *ldst++ = k + (l << 32);
01526 yc += 4;
01527 uc += 2;
01528 vc += 2;
01529 }
01530
01531 #else
01532 int i; int32_t *idst = (int32_t *) dst;
01533 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01534 for(i = 0; i < chromWidth; i++){
01535 #ifdef WORDS_BIGENDIAN
01536 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
01537 (yc[1] << 8) + (vc[0] << 0);
01538 #else
01539 *idst++ = yc[0] + (uc[0] << 8) +
01540 (yc[1] << 16) + (vc[0] << 24);
01541 #endif
01542 yc += 2;
01543 uc++;
01544 vc++;
01545 }
01546 #endif
01547 #endif
01548 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
01549 {
01550 usrc += chromStride;
01551 vsrc += chromStride;
01552 }
01553 ysrc += lumStride;
01554 dst += dstStride;
01555 }
01556 #ifdef HAVE_MMX
01557 asm( EMMS" \n\t"
01558 SFENCE" \n\t"
01559 :::"memory");
01560 #endif
01561 }
01562
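/* YV12 (4:2:0 planar) -> packed YUY2. Height should be a multiple of 2; the
   MMX path assumes width is a multiple of 16. */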
01568 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01569 unsigned int width, unsigned int height,
01570 int lumStride, int chromStride, int dstStride)
01571 {
01572
01573 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01574 }
01575
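/* Same as yuvPlanartoyuy2 but emits UYVY (U Y V Y) byte order. */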
01576 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01577 unsigned int width, unsigned int height,
01578 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01579 {
01580 unsigned y;
01581 const unsigned chromWidth= width>>1;
01582 for(y=0; y<height; y++)
01583 {
01584 #ifdef HAVE_MMX
01585
01586 asm volatile(
01587 "xorl %%eax, %%eax \n\t"
01588 ".balign 16 \n\t"
01589 "1: \n\t"
01590 PREFETCH" 32(%1, %%eax, 2) \n\t"
01591 PREFETCH" 32(%2, %%eax) \n\t"
01592 PREFETCH" 32(%3, %%eax) \n\t"
01593 "movq (%2, %%eax), %%mm0 \n\t"
01594 "movq %%mm0, %%mm2 \n\t"
01595 "movq (%3, %%eax), %%mm1 \n\t"
01596 "punpcklbw %%mm1, %%mm0 \n\t"
01597 "punpckhbw %%mm1, %%mm2 \n\t"
01598
01599 "movq (%1, %%eax,2), %%mm3 \n\t"
01600 "movq 8(%1, %%eax,2), %%mm5 \n\t"
01601 "movq %%mm0, %%mm4 \n\t"
01602 "movq %%mm2, %%mm6 \n\t"
01603 "punpcklbw %%mm3, %%mm0 \n\t"
01604 "punpckhbw %%mm3, %%mm4 \n\t"
01605 "punpcklbw %%mm5, %%mm2 \n\t"
01606 "punpckhbw %%mm5, %%mm6 \n\t"
01607
01608 MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
01609 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
01610 MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
01611 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
01612
01613 "addl $8, %%eax \n\t"
01614 "cmpl %4, %%eax \n\t"
01615 " jb 1b \n\t"
01616 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01617 : "%eax"
01618 );
01619 #else
01620
01621
01622 #if __WORDSIZE >= 64
01623 int i;
01624 uint64_t *ldst = (uint64_t *) dst;
01625 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01626 for(i = 0; i < chromWidth; i += 2){
01627 uint64_t k, l;
01628 k = uc[0] + (yc[0] << 8) +
01629 (vc[0] << 16) + (yc[1] << 24);
01630 l = uc[1] + (yc[2] << 8) +
01631 (vc[1] << 16) + (yc[3] << 24);
01632 *ldst++ = k + (l << 32);
01633 yc += 4;
01634 uc += 2;
01635 vc += 2;
01636 }
01637
01638 #else
01639 int i; int32_t *idst = (int32_t *) dst;
01640 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01641 for(i = 0; i < chromWidth; i++){
01642 #ifdef WORDS_BIGENDIAN
01643 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
01644 (vc[0] << 8) + (yc[1] << 0);
01645 #else
01646 *idst++ = uc[0] + (yc[0] << 8) +
01647 (vc[0] << 16) + (yc[1] << 24);
01648 #endif
01649 yc += 2;
01650 uc++;
01651 vc++;
01652 }
01653 #endif
01654 #endif
01655 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
01656 {
01657 usrc += chromStride;
01658 vsrc += chromStride;
01659 }
01660 ysrc += lumStride;
01661 dst += dstStride;
01662 }
01663 #ifdef HAVE_MMX
01664 asm( EMMS" \n\t"
01665 SFENCE" \n\t"
01666 :::"memory");
01667 #endif
01668 }
01669
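/* YV12 (4:2:0 planar) -> packed UYVY. */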
01675 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01676 unsigned int width, unsigned int height,
01677 int lumStride, int chromStride, int dstStride)
01678 {
01679
01680 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01681 }
01682
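/* Planar 4:2:2 -> packed YUY2 (every input line carries its own chroma line). */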
01687 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01688 unsigned int width, unsigned int height,
01689 int lumStride, int chromStride, int dstStride)
01690 {
01691 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01692 }
01693
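/* Packed YUY2 -> planar YV12. Chroma is taken from the even lines only (no
   vertical averaging). Height should be a multiple of 2; the MMX path assumes
   width is a multiple of 16. */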
01699 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01700 unsigned int width, unsigned int height,
01701 int lumStride, int chromStride, int srcStride)
01702 {
01703 unsigned y;
01704 const unsigned chromWidth= width>>1;
01705 for(y=0; y<height; y+=2)
01706 {
01707 #ifdef HAVE_MMX
01708 asm volatile(
01709 "xorl %%eax, %%eax \n\t"
01710 "pcmpeqw %%mm7, %%mm7 \n\t"
01711 "psrlw $8, %%mm7 \n\t"
01712 ".balign 16 \n\t"
01713 "1: \n\t"
01714 PREFETCH" 64(%0, %%eax, 4) \n\t"
01715 "movq (%0, %%eax, 4), %%mm0 \n\t"
01716 "movq 8(%0, %%eax, 4), %%mm1 \n\t"
01717 "movq %%mm0, %%mm2 \n\t"
01718 "movq %%mm1, %%mm3 \n\t"
01719 "psrlw $8, %%mm0 \n\t"
01720 "psrlw $8, %%mm1 \n\t"
01721 "pand %%mm7, %%mm2 \n\t"
01722 "pand %%mm7, %%mm3 \n\t"
01723 "packuswb %%mm1, %%mm0 \n\t"
01724 "packuswb %%mm3, %%mm2 \n\t"
01725
01726 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
01727
01728 "movq 16(%0, %%eax, 4), %%mm1 \n\t"
01729 "movq 24(%0, %%eax, 4), %%mm2 \n\t"
01730 "movq %%mm1, %%mm3 \n\t"
01731 "movq %%mm2, %%mm4 \n\t"
01732 "psrlw $8, %%mm1 \n\t"
01733 "psrlw $8, %%mm2 \n\t"
01734 "pand %%mm7, %%mm3 \n\t"
01735 "pand %%mm7, %%mm4 \n\t"
01736 "packuswb %%mm2, %%mm1 \n\t"
01737 "packuswb %%mm4, %%mm3 \n\t"
01738
01739 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
01740
01741 "movq %%mm0, %%mm2 \n\t"
01742 "movq %%mm1, %%mm3 \n\t"
01743 "psrlw $8, %%mm0 \n\t"
01744 "psrlw $8, %%mm1 \n\t"
01745 "pand %%mm7, %%mm2 \n\t"
01746 "pand %%mm7, %%mm3 \n\t"
01747 "packuswb %%mm1, %%mm0 \n\t"
01748 "packuswb %%mm3, %%mm2 \n\t"
01749
01750 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
01751 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
01752
01753 "addl $8, %%eax \n\t"
01754 "cmpl %4, %%eax \n\t"
01755 " jb 1b \n\t"
01756 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01757 : "memory", "%eax"
01758 );
01759
01760 ydst += lumStride;
01761 src += srcStride;
01762
01763 asm volatile(
01764 "xorl %%eax, %%eax \n\t"
01765 ".balign 16 \n\t"
01766 "1: \n\t"
01767 PREFETCH" 64(%0, %%eax, 4) \n\t"
01768 "movq (%0, %%eax, 4), %%mm0 \n\t"
01769 "movq 8(%0, %%eax, 4), %%mm1 \n\t"
01770 "movq 16(%0, %%eax, 4), %%mm2 \n\t"
01771 "movq 24(%0, %%eax, 4), %%mm3 \n\t"
01772 "pand %%mm7, %%mm0 \n\t"
01773 "pand %%mm7, %%mm1 \n\t"
01774 "pand %%mm7, %%mm2 \n\t"
01775 "pand %%mm7, %%mm3 \n\t"
01776 "packuswb %%mm1, %%mm0 \n\t"
01777 "packuswb %%mm3, %%mm2 \n\t"
01778
01779 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
01780 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
01781
01782 "addl $8, %%eax \n\t"
01783 "cmpl %4, %%eax \n\t"
01784 " jb 1b \n\t"
01785
01786 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01787 : "memory", "%eax"
01788 );
01789 #else
01790 unsigned i;
01791 for(i=0; i<chromWidth; i++)
01792 {
01793 ydst[2*i+0] = src[4*i+0];
01794 udst[i] = src[4*i+1];
01795 ydst[2*i+1] = src[4*i+2];
01796 vdst[i] = src[4*i+3];
01797 }
01798 ydst += lumStride;
01799 src += srcStride;
01800
01801 for(i=0; i<chromWidth; i++)
01802 {
01803 ydst[2*i+0] = src[4*i+0];
01804 ydst[2*i+1] = src[4*i+2];
01805 }
01806 #endif
01807 udst += chromStride;
01808 vdst += chromStride;
01809 ydst += lumStride;
01810 src += srcStride;
01811 }
01812 #ifdef HAVE_MMX
01813 asm volatile( EMMS" \n\t"
01814 SFENCE" \n\t"
01815 :::"memory");
01816 #endif
01817 }
01818
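/* YVU9 -> YV12: the luma plane is copied unchanged; the 4x4-subsampled YVU9
   chroma would additionally need 2x upsampling to produce YV12 chroma. */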
01819 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
01820 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01821 unsigned int width, unsigned int height, int lumStride, int chromStride)
01822 {
01823
01824 memcpy(ydst, ysrc, width*height);
01825
01826
01827 }
01828
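/* Upscale one plane to twice the width and height using 3:1/1:3 weighted
   interpolation; the first and last rows and columns are handled separately.
   The MMX2/3DNow! path approximates the weights with two pavgb passes. */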
01829 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
01830 {
01831 int x,y;
01832
01833 dst[0]= src[0];
01834
01835
01836 for(x=0; x<srcWidth-1; x++){
01837 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
01838 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
01839 }
01840 dst[2*srcWidth-1]= src[srcWidth-1];
01841
01842 dst+= dstStride;
01843
01844 for(y=1; y<srcHeight; y++){
01845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01846 const int mmxSize= srcWidth&~15;
01847 asm volatile(
01848 "movl %4, %%eax \n\t"
01849 "1: \n\t"
01850 "movq (%0, %%eax), %%mm0 \n\t"
01851 "movq (%1, %%eax), %%mm1 \n\t"
01852 "movq 1(%0, %%eax), %%mm2 \n\t"
01853 "movq 1(%1, %%eax), %%mm3 \n\t"
01854 "movq -1(%0, %%eax), %%mm4 \n\t"
01855 "movq -1(%1, %%eax), %%mm5 \n\t"
01856 PAVGB" %%mm0, %%mm5 \n\t"
01857 PAVGB" %%mm0, %%mm3 \n\t"
01858 PAVGB" %%mm0, %%mm5 \n\t"
01859 PAVGB" %%mm0, %%mm3 \n\t"
01860 PAVGB" %%mm1, %%mm4 \n\t"
01861 PAVGB" %%mm1, %%mm2 \n\t"
01862 PAVGB" %%mm1, %%mm4 \n\t"
01863 PAVGB" %%mm1, %%mm2 \n\t"
01864 "movq %%mm5, %%mm7 \n\t"
01865 "movq %%mm4, %%mm6 \n\t"
01866 "punpcklbw %%mm3, %%mm5 \n\t"
01867 "punpckhbw %%mm3, %%mm7 \n\t"
01868 "punpcklbw %%mm2, %%mm4 \n\t"
01869 "punpckhbw %%mm2, %%mm6 \n\t"
01870 #if 1
01871 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
01872 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
01873 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
01874 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
01875 #else
01876 "movq %%mm5, (%2, %%eax, 2) \n\t"
01877 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
01878 "movq %%mm4, (%3, %%eax, 2) \n\t"
01879 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
01880 #endif
01881 "addl $8, %%eax \n\t"
01882 " js 1b \n\t"
01883 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
01884 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
01885 "g" (-mmxSize)
01886 : "%eax"
01887
01888 );
01889 #else
01890 const int mmxSize=1;
01891 #endif
01892 dst[0 ]= (3*src[0] + src[srcStride])>>2;
01893 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
01894
01895 for(x=mmxSize-1; x<srcWidth-1; x++){
01896 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
01897 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
01898 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
01899 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
01900 }
01901 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
01902 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
01903
01904 dst+=dstStride*2;
01905 src+=srcStride;
01906 }
01907
01908
01909 #if 1
01910 dst[0]= src[0];
01911
01912 for(x=0; x<srcWidth-1; x++){
01913 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
01914 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
01915 }
01916 dst[2*srcWidth-1]= src[srcWidth-1];
01917 #else
01918 for(x=0; x<srcWidth; x++){
01919 dst[2*x+0]=
01920 dst[2*x+1]= src[x];
01921 }
01922 #endif
01923
01924 #ifdef HAVE_MMX
01925 asm volatile( EMMS" \n\t"
01926 SFENCE" \n\t"
01927 :::"memory");
01928 #endif
01929 }
01930
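/* Packed UYVY -> planar YV12; chroma is taken from the even lines only. */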
01937 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01938 unsigned int width, unsigned int height,
01939 int lumStride, int chromStride, int srcStride)
01940 {
01941 unsigned y;
01942 const unsigned chromWidth= width>>1;
01943 for(y=0; y<height; y+=2)
01944 {
01945 #ifdef HAVE_MMX
01946 asm volatile(
01947 "xorl %%eax, %%eax \n\t"
01948 "pcmpeqw %%mm7, %%mm7 \n\t"
01949 "psrlw $8, %%mm7 \n\t"
01950 ".balign 16 \n\t"
01951 "1: \n\t"
01952 PREFETCH" 64(%0, %%eax, 4) \n\t"
01953 "movq (%0, %%eax, 4), %%mm0 \n\t"
01954 "movq 8(%0, %%eax, 4), %%mm1 \n\t"
01955 "movq %%mm0, %%mm2 \n\t"
01956 "movq %%mm1, %%mm3 \n\t"
01957 "pand %%mm7, %%mm0 \n\t"
01958 "pand %%mm7, %%mm1 \n\t"
01959 "psrlw $8, %%mm2 \n\t"
01960 "psrlw $8, %%mm3 \n\t"
01961 "packuswb %%mm1, %%mm0 \n\t"
01962 "packuswb %%mm3, %%mm2 \n\t"
01963
01964 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
01965
01966 "movq 16(%0, %%eax, 4), %%mm1 \n\t"
01967 "movq 24(%0, %%eax, 4), %%mm2 \n\t"
01968 "movq %%mm1, %%mm3 \n\t"
01969 "movq %%mm2, %%mm4 \n\t"
01970 "pand %%mm7, %%mm1 \n\t"
01971 "pand %%mm7, %%mm2 \n\t"
01972 "psrlw $8, %%mm3 \n\t"
01973 "psrlw $8, %%mm4 \n\t"
01974 "packuswb %%mm2, %%mm1 \n\t"
01975 "packuswb %%mm4, %%mm3 \n\t"
01976
01977 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
01978
01979 "movq %%mm0, %%mm2 \n\t"
01980 "movq %%mm1, %%mm3 \n\t"
01981 "psrlw $8, %%mm0 \n\t"
01982 "psrlw $8, %%mm1 \n\t"
01983 "pand %%mm7, %%mm2 \n\t"
01984 "pand %%mm7, %%mm3 \n\t"
01985 "packuswb %%mm1, %%mm0 \n\t"
01986 "packuswb %%mm3, %%mm2 \n\t"
01987
01988 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
01989 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
01990
01991 "addl $8, %%eax \n\t"
01992 "cmpl %4, %%eax \n\t"
01993 " jb 1b \n\t"
01994 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01995 : "memory", "%eax"
01996 );
01997
01998 ydst += lumStride;
01999 src += srcStride;
02000
02001 asm volatile(
02002 "xorl %%eax, %%eax \n\t"
02003 ".balign 16 \n\t"
02004 "1: \n\t"
02005 PREFETCH" 64(%0, %%eax, 4) \n\t"
02006 "movq (%0, %%eax, 4), %%mm0 \n\t"
02007 "movq 8(%0, %%eax, 4), %%mm1 \n\t"
02008 "movq 16(%0, %%eax, 4), %%mm2 \n\t"
02009 "movq 24(%0, %%eax, 4), %%mm3 \n\t"
02010 "psrlw $8, %%mm0 \n\t"
02011 "psrlw $8, %%mm1 \n\t"
02012 "psrlw $8, %%mm2 \n\t"
02013 "psrlw $8, %%mm3 \n\t"
02014 "packuswb %%mm1, %%mm0 \n\t"
02015 "packuswb %%mm3, %%mm2 \n\t"
02016
02017 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
02018 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
02019
02020 "addl $8, %%eax \n\t"
02021 "cmpl %4, %%eax \n\t"
02022 " jb 1b \n\t"
02023
02024 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
02025 : "memory", "%eax"
02026 );
02027 #else
02028 unsigned i;
02029 for(i=0; i<chromWidth; i++)
02030 {
02031 udst[i] = src[4*i+0];
02032 ydst[2*i+0] = src[4*i+1];
02033 vdst[i] = src[4*i+2];
02034 ydst[2*i+1] = src[4*i+3];
02035 }
02036 ydst += lumStride;
02037 src += srcStride;
02038
02039 for(i=0; i<chromWidth; i++)
02040 {
02041 ydst[2*i+0] = src[4*i+1];
02042 ydst[2*i+1] = src[4*i+3];
02043 }
02044 #endif
02045 udst += chromStride;
02046 vdst += chromStride;
02047 ydst += lumStride;
02048 src += srcStride;
02049 }
02050 #ifdef HAVE_MMX
02051 asm volatile( EMMS" \n\t"
02052 SFENCE" \n\t"
02053 :::"memory");
02054 #endif
02055 }
02056
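/* Packed 24-bit RGB -> planar YV12. Luma is computed per pixel with pmaddwd
   against bgr2YCoeff (plus bgr2YOffset); chroma is averaged over 2x2 pixel
   blocks and converted with bgr2UCoeff/bgr2VCoeff, reading two source lines
   at a time. */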
02063 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
02064 unsigned int width, unsigned int height,
02065 int lumStride, int chromStride, int srcStride)
02066 {
02067 unsigned y;
02068 const unsigned chromWidth= width>>1;
02069 #ifdef HAVE_MMX
02070 for(y=0; y<height-2; y+=2)
02071 {
02072 unsigned i;
02073 for(i=0; i<2; i++)
02074 {
02075 asm volatile(
02076 "movl %2, %%eax \n\t"
02077 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
02078 "movq "MANGLE(w1111)", %%mm5 \n\t"
02079 "pxor %%mm7, %%mm7 \n\t"
02080 "leal (%%eax, %%eax, 2), %%ebx \n\t"
02081 ".balign 16 \n\t"
02082 "1: \n\t"
02083 PREFETCH" 64(%0, %%ebx) \n\t"
02084 "movd (%0, %%ebx), %%mm0 \n\t"
02085 "movd 3(%0, %%ebx), %%mm1 \n\t"
02086 "punpcklbw %%mm7, %%mm0 \n\t"
02087 "punpcklbw %%mm7, %%mm1 \n\t"
02088 "movd 6(%0, %%ebx), %%mm2 \n\t"
02089 "movd 9(%0, %%ebx), %%mm3 \n\t"
02090 "punpcklbw %%mm7, %%mm2 \n\t"
02091 "punpcklbw %%mm7, %%mm3 \n\t"
02092 "pmaddwd %%mm6, %%mm0 \n\t"
02093 "pmaddwd %%mm6, %%mm1 \n\t"
02094 "pmaddwd %%mm6, %%mm2 \n\t"
02095 "pmaddwd %%mm6, %%mm3 \n\t"
02096 #ifndef FAST_BGR2YV12
02097 "psrad $8, %%mm0 \n\t"
02098 "psrad $8, %%mm1 \n\t"
02099 "psrad $8, %%mm2 \n\t"
02100 "psrad $8, %%mm3 \n\t"
02101 #endif
02102 "packssdw %%mm1, %%mm0 \n\t"
02103 "packssdw %%mm3, %%mm2 \n\t"
02104 "pmaddwd %%mm5, %%mm0 \n\t"
02105 "pmaddwd %%mm5, %%mm2 \n\t"
02106 "packssdw %%mm2, %%mm0 \n\t"
02107 "psraw $7, %%mm0 \n\t"
02108
02109 "movd 12(%0, %%ebx), %%mm4 \n\t"
02110 "movd 15(%0, %%ebx), %%mm1 \n\t"
02111 "punpcklbw %%mm7, %%mm4 \n\t"
02112 "punpcklbw %%mm7, %%mm1 \n\t"
02113 "movd 18(%0, %%ebx), %%mm2 \n\t"
02114 "movd 21(%0, %%ebx), %%mm3 \n\t"
02115 "punpcklbw %%mm7, %%mm2 \n\t"
02116 "punpcklbw %%mm7, %%mm3 \n\t"
02117 "pmaddwd %%mm6, %%mm4 \n\t"
02118 "pmaddwd %%mm6, %%mm1 \n\t"
02119 "pmaddwd %%mm6, %%mm2 \n\t"
02120 "pmaddwd %%mm6, %%mm3 \n\t"
02121 #ifndef FAST_BGR2YV12
02122 "psrad $8, %%mm4 \n\t"
02123 "psrad $8, %%mm1 \n\t"
02124 "psrad $8, %%mm2 \n\t"
02125 "psrad $8, %%mm3 \n\t"
02126 #endif
02127 "packssdw %%mm1, %%mm4 \n\t"
02128 "packssdw %%mm3, %%mm2 \n\t"
02129 "pmaddwd %%mm5, %%mm4 \n\t"
02130 "pmaddwd %%mm5, %%mm2 \n\t"
02131 "addl $24, %%ebx \n\t"
02132 "packssdw %%mm2, %%mm4 \n\t"
02133 "psraw $7, %%mm4 \n\t"
02134
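/* pack the eight luma words to unsigned bytes, add the bgr2YOffset bias
   and stream them to the Y plane */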
02135 "packuswb %%mm4, %%mm0 \n\t"
02136 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
02137
02138 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
02139 "addl $8, %%eax \n\t"
02140 " js 1b \n\t"
02141 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
02142 : "%eax", "%ebx"
02143 );
02144 ydst += lumStride;
02145 src += srcStride;
02146 }
02147 src -= srcStride*2;
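/* chroma pass: the two source lines processed above are revisited and averaged
   (2x2 box) to produce one line of U and one line of V */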
02148 asm volatile(
02149 "movl %4, %%eax \n\t"
02150 "movq "MANGLE(w1111)", %%mm5 \n\t"
02151 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
02152 "pxor %%mm7, %%mm7 \n\t"
02153 "leal (%%eax, %%eax, 2), %%ebx \n\t"
02154 "addl %%ebx, %%ebx \n\t"
02155 ".balign 16 \n\t"
02156 "1: \n\t"
02157 PREFETCH" 64(%0, %%ebx) \n\t"
02158 PREFETCH" 64(%1, %%ebx) \n\t"
02159 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
02160 "movq (%0, %%ebx), %%mm0 \n\t"
02161 "movq (%1, %%ebx), %%mm1 \n\t"
02162 "movq 6(%0, %%ebx), %%mm2 \n\t"
02163 "movq 6(%1, %%ebx), %%mm3 \n\t"
02164 PAVGB" %%mm1, %%mm0 \n\t"
02165 PAVGB" %%mm3, %%mm2 \n\t"
02166 "movq %%mm0, %%mm1 \n\t"
02167 "movq %%mm2, %%mm3 \n\t"
02168 "psrlq $24, %%mm0 \n\t"
02169 "psrlq $24, %%mm2 \n\t"
02170 PAVGB" %%mm1, %%mm0 \n\t"
02171 PAVGB" %%mm3, %%mm2 \n\t"
02172 "punpcklbw %%mm7, %%mm0 \n\t"
02173 "punpcklbw %%mm7, %%mm2 \n\t"
02174 #else
02175 "movd (%0, %%ebx), %%mm0 \n\t"
02176 "movd (%1, %%ebx), %%mm1 \n\t"
02177 "movd 3(%0, %%ebx), %%mm2 \n\t"
02178 "movd 3(%1, %%ebx), %%mm3 \n\t"
02179 "punpcklbw %%mm7, %%mm0 \n\t"
02180 "punpcklbw %%mm7, %%mm1 \n\t"
02181 "punpcklbw %%mm7, %%mm2 \n\t"
02182 "punpcklbw %%mm7, %%mm3 \n\t"
02183 "paddw %%mm1, %%mm0 \n\t"
02184 "paddw %%mm3, %%mm2 \n\t"
02185 "paddw %%mm2, %%mm0 \n\t"
02186 "movd 6(%0, %%ebx), %%mm4 \n\t"
02187 "movd 6(%1, %%ebx), %%mm1 \n\t"
02188 "movd 9(%0, %%ebx), %%mm2 \n\t"
02189 "movd 9(%1, %%ebx), %%mm3 \n\t"
02190 "punpcklbw %%mm7, %%mm4 \n\t"
02191 "punpcklbw %%mm7, %%mm1 \n\t"
02192 "punpcklbw %%mm7, %%mm2 \n\t"
02193 "punpcklbw %%mm7, %%mm3 \n\t"
02194 "paddw %%mm1, %%mm4 \n\t"
02195 "paddw %%mm3, %%mm2 \n\t"
02196 "paddw %%mm4, %%mm2 \n\t"
02197 "psrlw $2, %%mm0 \n\t"
02198 "psrlw $2, %%mm2 \n\t"
02199 #endif
02200 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
02201 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
02202
02203 "pmaddwd %%mm0, %%mm1 \n\t"
02204 "pmaddwd %%mm2, %%mm3 \n\t"
02205 "pmaddwd %%mm6, %%mm0 \n\t"
02206 "pmaddwd %%mm6, %%mm2 \n\t"
02207 #ifndef FAST_BGR2YV12
02208 "psrad $8, %%mm0 \n\t"
02209 "psrad $8, %%mm1 \n\t"
02210 "psrad $8, %%mm2 \n\t"
02211 "psrad $8, %%mm3 \n\t"
02212 #endif
02213 "packssdw %%mm2, %%mm0 \n\t"
02214 "packssdw %%mm3, %%mm1 \n\t"
02215 "pmaddwd %%mm5, %%mm0 \n\t"
02216 "pmaddwd %%mm5, %%mm1 \n\t"
02217 "packssdw %%mm1, %%mm0 \n\t"
02218 "psraw $7, %%mm0 \n\t"
02219
02220 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
02221 "movq 12(%0, %%ebx), %%mm4 \n\t"
02222 "movq 12(%1, %%ebx), %%mm1 \n\t"
02223 "movq 18(%0, %%ebx), %%mm2 \n\t"
02224 "movq 18(%1, %%ebx), %%mm3 \n\t"
02225 PAVGB" %%mm1, %%mm4 \n\t"
02226 PAVGB" %%mm3, %%mm2 \n\t"
02227 "movq %%mm4, %%mm1 \n\t"
02228 "movq %%mm2, %%mm3 \n\t"
02229 "psrlq $24, %%mm4 \n\t"
02230 "psrlq $24, %%mm2 \n\t"
02231 PAVGB" %%mm1, %%mm4 \n\t"
02232 PAVGB" %%mm3, %%mm2 \n\t"
02233 "punpcklbw %%mm7, %%mm4 \n\t"
02234 "punpcklbw %%mm7, %%mm2 \n\t"
02235 #else
02236 "movd 12(%0, %%ebx), %%mm4 \n\t"
02237 "movd 12(%1, %%ebx), %%mm1 \n\t"
02238 "movd 15(%0, %%ebx), %%mm2 \n\t"
02239 "movd 15(%1, %%ebx), %%mm3 \n\t"
02240 "punpcklbw %%mm7, %%mm4 \n\t"
02241 "punpcklbw %%mm7, %%mm1 \n\t"
02242 "punpcklbw %%mm7, %%mm2 \n\t"
02243 "punpcklbw %%mm7, %%mm3 \n\t"
02244 "paddw %%mm1, %%mm4 \n\t"
02245 "paddw %%mm3, %%mm2 \n\t"
02246 "paddw %%mm2, %%mm4 \n\t"
02247 "movd 18(%0, %%ebx), %%mm5 \n\t"
02248 "movd 18(%1, %%ebx), %%mm1 \n\t"
02249 "movd 21(%0, %%ebx), %%mm2 \n\t"
02250 "movd 21(%1, %%ebx), %%mm3 \n\t"
02251 "punpcklbw %%mm7, %%mm5 \n\t"
02252 "punpcklbw %%mm7, %%mm1 \n\t"
02253 "punpcklbw %%mm7, %%mm2 \n\t"
02254 "punpcklbw %%mm7, %%mm3 \n\t"
02255 "paddw %%mm1, %%mm5 \n\t"
02256 "paddw %%mm3, %%mm2 \n\t"
02257 "paddw %%mm5, %%mm2 \n\t"
02258 "movq "MANGLE(w1111)", %%mm5 \n\t"
02259 "psrlw $2, %%mm4 \n\t"
02260 "psrlw $2, %%mm2 \n\t"
02261 #endif
02262 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
02263 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
02264
02265 "pmaddwd %%mm4, %%mm1 \n\t"
02266 "pmaddwd %%mm2, %%mm3 \n\t"
02267 "pmaddwd %%mm6, %%mm4 \n\t"
02268 "pmaddwd %%mm6, %%mm2 \n\t"
02269 #ifndef FAST_BGR2YV12
02270 "psrad $8, %%mm4 \n\t"
02271 "psrad $8, %%mm1 \n\t"
02272 "psrad $8, %%mm2 \n\t"
02273 "psrad $8, %%mm3 \n\t"
02274 #endif
02275 "packssdw %%mm2, %%mm4 \n\t"
02276 "packssdw %%mm3, %%mm1 \n\t"
02277 "pmaddwd %%mm5, %%mm4 \n\t"
02278 "pmaddwd %%mm5, %%mm1 \n\t"
02279 "addl $24, %%ebx \n\t"
02280 "packssdw %%mm1, %%mm4 \n\t"
02281 "psraw $7, %%mm4 \n\t"
02282
02283 "movq %%mm0, %%mm1 \n\t"
02284 "punpckldq %%mm4, %%mm0 \n\t"
02285 "punpckhdq %%mm4, %%mm1 \n\t"
02286 "packsswb %%mm1, %%mm0 \n\t"
02287 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
02288
02289 "movd %%mm0, (%2, %%eax) \n\t"
02290 "punpckhdq %%mm0, %%mm0 \n\t"
02291 "movd %%mm0, (%3, %%eax) \n\t"
02292 "addl $4, %%eax \n\t"
02293 " js 1b \n\t"
02294 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
02295 : "%eax", "%ebx"
02296 );
02297
02298 udst += chromStride;
02299 vdst += chromStride;
02300 src += srcStride*2;
02301 }
02302
02303 asm volatile( EMMS" \n\t"
02304 SFENCE" \n\t"
02305 :::"memory");
02306 #else
02307 y=0;
02308 #endif
02309 for(; y<height; y+=2)
02310 {
02311 unsigned i;
02312 for(i=0; i<chromWidth; i++)
02313 {
02314 unsigned int b= src[6*i+0];
02315 unsigned int g= src[6*i+1];
02316 unsigned int r= src[6*i+2];
02317
02318 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02319 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
02320 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
02321
02322 udst[i] = U;
02323 vdst[i] = V;
02324 ydst[2*i] = Y;
02325
02326 b= src[6*i+3];
02327 g= src[6*i+4];
02328 r= src[6*i+5];
02329
02330 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02331 ydst[2*i+1] = Y;
02332 }
02333 ydst += lumStride;
02334 src += srcStride;
02335
02336 for(i=0; i<chromWidth; i++)
02337 {
02338 unsigned int b= src[6*i+0];
02339 unsigned int g= src[6*i+1];
02340 unsigned int r= src[6*i+2];
02341
02342 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02343
02344 ydst[2*i] = Y;
02345
02346 b= src[6*i+3];
02347 g= src[6*i+4];
02348 r= src[6*i+5];
02349
02350 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02351 ydst[2*i+1] = Y;
02352 }
02353 udst += chromStride;
02354 vdst += chromStride;
02355 ydst += lumStride;
02356 src += srcStride;
02357 }
02358 }
02359
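/*
 * Interleaves two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i] for each row.
 * The SIMD paths assume width >= 16; the remainder bytes are handled by a scalar loop.
 */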
02360 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
02361 unsigned width, unsigned height, int src1Stride,
02362 int src2Stride, int dstStride){
02363 unsigned h;
02364
02365 for(h=0; h < height; h++)
02366 {
02367 unsigned w;
02368
02369 #ifdef HAVE_MMX
02370 #ifdef HAVE_SSE2
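/* SSE2 path: movdqa/movntdq require 16-byte aligned rows; each iteration interleaves
   16 bytes from both sources into 32 output bytes */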
02371 asm(
02372 "xorl %%eax, %%eax \n\t"
02373 "1: \n\t"
02374 PREFETCH" 64(%1, %%eax) \n\t"
02375 PREFETCH" 64(%2, %%eax) \n\t"
02376 "movdqa (%1, %%eax), %%xmm0 \n\t"
02377 "movdqa (%1, %%eax), %%xmm1 \n\t"
02378 "movdqa (%2, %%eax), %%xmm2 \n\t"
02379 "punpcklbw %%xmm2, %%xmm0 \n\t"
02380 "punpckhbw %%xmm2, %%xmm1 \n\t"
02381 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
02382 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
02383 "addl $16, %%eax \n\t"
02384 "cmpl %3, %%eax \n\t"
02385 " jb 1b \n\t"
02386 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
02387 : "memory", "%eax"
02388 );
02389 #else
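/* MMX path: the same interleave, 16 source bytes per iteration, using
   punpcklbw/punpckhbw on the mm registers */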
02390 asm(
02391 "xorl %%eax, %%eax \n\t"
02392 "1: \n\t"
02393 PREFETCH" 64(%1, %%eax) \n\t"
02394 PREFETCH" 64(%2, %%eax) \n\t"
02395 "movq (%1, %%eax), %%mm0 \n\t"
02396 "movq 8(%1, %%eax), %%mm2 \n\t"
02397 "movq %%mm0, %%mm1 \n\t"
02398 "movq %%mm2, %%mm3 \n\t"
02399 "movq (%2, %%eax), %%mm4 \n\t"
02400 "movq 8(%2, %%eax), %%mm5 \n\t"
02401 "punpcklbw %%mm4, %%mm0 \n\t"
02402 "punpckhbw %%mm4, %%mm1 \n\t"
02403 "punpcklbw %%mm5, %%mm2 \n\t"
02404 "punpckhbw %%mm5, %%mm3 \n\t"
02405 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
02406 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
02407 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
02408 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
02409 "addl $16, %%eax \n\t"
02410 "cmpl %3, %%eax \n\t"
02411 " jb 1b \n\t"
02412 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
02413 : "memory", "%eax"
02414 );
02415 #endif
02416 for(w= (width&(~15)); w < width; w++)
02417 {
02418 dest[2*w+0] = src1[w];
02419 dest[2*w+1] = src2[w];
02420 }
02421 #else
02422 for(w=0; w < width; w++)
02423 {
02424 dest[2*w+0] = src1[w];
02425 dest[2*w+1] = src2[w];
02426 }
02427 #endif
02428 dest += dstStride;
02429 src1 += src1Stride;
02430 src2 += src2Stride;
02431 }
02432 #ifdef HAVE_MMX
02433 asm(
02434 EMMS" \n\t"
02435 SFENCE" \n\t"
02436 ::: "memory"
02437 );
02438 #endif
02439 }
02440
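/*
 * Doubles the resolution of two chroma planes: every source byte is written twice
 * horizontally and every source line is used for two destination lines.
 */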
02441 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
02442 uint8_t *dst1, uint8_t *dst2,
02443 unsigned width, unsigned height,
02444 int srcStride1, int srcStride2,
02445 int dstStride1, int dstStride2)
02446 {
02447 unsigned int y,x,h;
02448 int w;
02449 w=width/2; h=height/2;
02450 #ifdef HAVE_MMX
02451 asm volatile(
02452 PREFETCH" %0\n\t"
02453 PREFETCH" %1\n\t"
02454 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
02455 #endif
02456 for(y=0;y<h;y++){
02457 const uint8_t* s1=src1+srcStride1*(y>>1);
02458 uint8_t* d=dst1+dstStride1*y;
02459 x=0;
02460 #ifdef HAVE_MMX
02461 for(;x<w-31;x+=32)
02462 {
02463 asm volatile(
02464 PREFETCH" 32%1\n\t"
02465 "movq %1, %%mm0\n\t"
02466 "movq 8%1, %%mm2\n\t"
02467 "movq 16%1, %%mm4\n\t"
02468 "movq 24%1, %%mm6\n\t"
02469 "movq %%mm0, %%mm1\n\t"
02470 "movq %%mm2, %%mm3\n\t"
02471 "movq %%mm4, %%mm5\n\t"
02472 "movq %%mm6, %%mm7\n\t"
02473 "punpcklbw %%mm0, %%mm0\n\t"
02474 "punpckhbw %%mm1, %%mm1\n\t"
02475 "punpcklbw %%mm2, %%mm2\n\t"
02476 "punpckhbw %%mm3, %%mm3\n\t"
02477 "punpcklbw %%mm4, %%mm4\n\t"
02478 "punpckhbw %%mm5, %%mm5\n\t"
02479 "punpcklbw %%mm6, %%mm6\n\t"
02480 "punpckhbw %%mm7, %%mm7\n\t"
02481 MOVNTQ" %%mm0, %0\n\t"
02482 MOVNTQ" %%mm1, 8%0\n\t"
02483 MOVNTQ" %%mm2, 16%0\n\t"
02484 MOVNTQ" %%mm3, 24%0\n\t"
02485 MOVNTQ" %%mm4, 32%0\n\t"
02486 MOVNTQ" %%mm5, 40%0\n\t"
02487 MOVNTQ" %%mm6, 48%0\n\t"
02488 MOVNTQ" %%mm7, 56%0"
02489 :"=m"(d[2*x])
02490 :"m"(s1[x])
02491 :"memory");
02492 }
02493 #endif
02494 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
02495 }
02496 for(y=0;y<h;y++){
02497 const uint8_t* s2=src2+srcStride2*(y>>1);
02498 uint8_t* d=dst2+dstStride2*y;
02499 x=0;
02500 #ifdef HAVE_MMX
02501 for(;x<w-31;x+=32)
02502 {
02503 asm volatile(
02504 PREFETCH" 32%1\n\t"
02505 "movq %1, %%mm0\n\t"
02506 "movq 8%1, %%mm2\n\t"
02507 "movq 16%1, %%mm4\n\t"
02508 "movq 24%1, %%mm6\n\t"
02509 "movq %%mm0, %%mm1\n\t"
02510 "movq %%mm2, %%mm3\n\t"
02511 "movq %%mm4, %%mm5\n\t"
02512 "movq %%mm6, %%mm7\n\t"
02513 "punpcklbw %%mm0, %%mm0\n\t"
02514 "punpckhbw %%mm1, %%mm1\n\t"
02515 "punpcklbw %%mm2, %%mm2\n\t"
02516 "punpckhbw %%mm3, %%mm3\n\t"
02517 "punpcklbw %%mm4, %%mm4\n\t"
02518 "punpckhbw %%mm5, %%mm5\n\t"
02519 "punpcklbw %%mm6, %%mm6\n\t"
02520 "punpckhbw %%mm7, %%mm7\n\t"
02521 MOVNTQ" %%mm0, %0\n\t"
02522 MOVNTQ" %%mm1, 8%0\n\t"
02523 MOVNTQ" %%mm2, 16%0\n\t"
02524 MOVNTQ" %%mm3, 24%0\n\t"
02525 MOVNTQ" %%mm4, 32%0\n\t"
02526 MOVNTQ" %%mm5, 40%0\n\t"
02527 MOVNTQ" %%mm6, 48%0\n\t"
02528 MOVNTQ" %%mm7, 56%0"
02529 :"=m"(d[2*x])
02530 :"m"(s2[x])
02531 :"memory");
02532 }
02533 #endif
02534 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
02535 }
02536 #ifdef HAVE_MMX
02537 asm(
02538 EMMS" \n\t"
02539 SFENCE" \n\t"
02540 ::: "memory"
02541 );
02542 #endif
02543 }
02544
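/*
 * Converts planar YUV with 4x4-subsampled chroma to packed YUY2: each chroma sample is
 * reused for four luma samples within a line and for four consecutive lines
 * (src2 supplies byte 1 of each YUYV group, src3 supplies byte 3).
 */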
02545 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
02546 uint8_t *dst,
02547 unsigned width, unsigned height,
02548 int srcStride1, int srcStride2,
02549 int srcStride3, int dstStride)
02550 {
02551 unsigned y,x,w,h;
02552 w=width/2; h=height;
02553 for(y=0;y<h;y++){
02554 const uint8_t* yp=src1+srcStride1*y;
02555 const uint8_t* up=src2+srcStride2*(y>>2);
02556 const uint8_t* vp=src3+srcStride3*(y>>2);
02557 uint8_t* d=dst+dstStride*y;
02558 x=0;
02559 #ifdef HAVE_MMX
02560 for(;x<w-7;x+=8)
02561 {
02562 asm volatile(
02563 PREFETCH" 32(%1, %0)\n\t"
02564 PREFETCH" 32(%2, %0)\n\t"
02565 PREFETCH" 32(%3, %0)\n\t"
02566 "movq (%1, %0, 4), %%mm0\n\t"
02567 "movq (%2, %0), %%mm1\n\t"
02568 "movq (%3, %0), %%mm2\n\t"
02569 "movq %%mm0, %%mm3\n\t"
02570 "movq %%mm1, %%mm4\n\t"
02571 "movq %%mm2, %%mm5\n\t"
02572 "punpcklbw %%mm1, %%mm1\n\t"
02573 "punpcklbw %%mm2, %%mm2\n\t"
02574 "punpckhbw %%mm4, %%mm4\n\t"
02575 "punpckhbw %%mm5, %%mm5\n\t"
02576
02577 "movq %%mm1, %%mm6\n\t"
02578 "punpcklbw %%mm2, %%mm1\n\t"
02579 "punpcklbw %%mm1, %%mm0\n\t"
02580 "punpckhbw %%mm1, %%mm3\n\t"
02581 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
02582 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
02583
02584 "punpckhbw %%mm2, %%mm6\n\t"
02585 "movq 8(%1, %0, 4), %%mm0\n\t"
02586 "movq %%mm0, %%mm3\n\t"
02587 "punpcklbw %%mm6, %%mm0\n\t"
02588 "punpckhbw %%mm6, %%mm3\n\t"
02589 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
02590 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
02591
02592 "movq %%mm4, %%mm6\n\t"
02593 "movq 16(%1, %0, 4), %%mm0\n\t"
02594 "movq %%mm0, %%mm3\n\t"
02595 "punpcklbw %%mm5, %%mm4\n\t"
02596 "punpcklbw %%mm4, %%mm0\n\t"
02597 "punpckhbw %%mm4, %%mm3\n\t"
02598 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
02599 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
02600
02601 "punpckhbw %%mm5, %%mm6\n\t"
02602 "movq 24(%1, %0, 4), %%mm0\n\t"
02603 "movq %%mm0, %%mm3\n\t"
02604 "punpcklbw %%mm6, %%mm0\n\t"
02605 "punpckhbw %%mm6, %%mm3\n\t"
02606 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
02607 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
02608
02609 : "+r" (x)
02610 : "r"(yp), "r" (up), "r"(vp), "r"(d)
02611 :"memory");
02612 }
02613 #endif
02614 for(; x<w; x++)
02615 {
02616 const int x2= x<<2;
02617 d[8*x+0]=yp[x2];
02618 d[8*x+1]=up[x];
02619 d[8*x+2]=yp[x2+1];
02620 d[8*x+3]=vp[x];
02621 d[8*x+4]=yp[x2+2];
02622 d[8*x+5]=up[x];
02623 d[8*x+6]=yp[x2+3];
02624 d[8*x+7]=vp[x];
02625 }
02626 }
02627 #ifdef HAVE_MMX
02628 asm(
02629 EMMS" \n\t"
02630 SFENCE" \n\t"
02631 ::: "memory"
02632 );
02633 #endif
02634 }