Main Page | Modules | Class Hierarchy | Class List | Directories | File List | Class Members | File Members | Related Pages

rgb2rgb_template.c

00001 /*
00002  *
00003  *  rgb2rgb.c, Software RGB to RGB convertor
00004  *  pluralize by Software PAL8 to RGB convertor
00005  *               Software YUV to YUV convertor
00006  *               Software YUV to RGB convertor
00007  *  Written by Nick Kurshev.
00008  *  palette & yuv & runtime cpu stuff by Michael ([email protected]) (under GPL)
00009  */
00010 
00011 #include <stddef.h>
00012 #include <inttypes.h> /* for __WORDSIZE */
00013 
00014 #ifndef __WORDSIZE
00015 // #warning You have misconfigured system and probably will lose performance!
00016 #define __WORDSIZE MP_WORDSIZE
00017 #endif
00018 
00019 #undef PREFETCH
00020 #undef MOVNTQ
00021 #undef EMMS
00022 #undef SFENCE
00023 #undef MMREG_SIZE
00024 #undef PREFETCHW
00025 #undef PAVGB
00026 
00027 #ifdef HAVE_SSE2
00028 #define MMREG_SIZE 16
00029 #else
00030 #define MMREG_SIZE 8
00031 #endif
00032 
00033 #ifdef HAVE_3DNOW
00034 #define PREFETCH  "prefetch"
00035 #define PREFETCHW "prefetchw"
00036 #define PAVGB     "pavgusb"
00037 #elif defined ( HAVE_MMX2 )
00038 #define PREFETCH "prefetchnta"
00039 #define PREFETCHW "prefetcht0"
00040 #define PAVGB     "pavgb"
00041 #else
00042 #define PREFETCH "/nop"
00043 #define PREFETCHW "/nop"
00044 #endif
00045 
00046 #ifdef HAVE_3DNOW
00047 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
00048 #define EMMS     "femms"
00049 #else
00050 #define EMMS     "emms"
00051 #endif
00052 
00053 #ifdef HAVE_MMX2
00054 #define MOVNTQ "movntq"
00055 #define SFENCE "sfence"
00056 #else
00057 #define MOVNTQ "movq"
00058 #define SFENCE "/nop"
00059 #endif
00060 
/*
 * Expand packed 3-byte pixels into 4-byte pixels.
 *
 * The scalar loop copies the three bytes of every source pixel in order and
 * appends one zero byte. With HAVE_MMX, eight pixels (24 source bytes, 32
 * destination bytes) are converted per iteration and masked with mask32
 * (defined outside this block); whatever is left goes through the scalar loop.
 *
 * src      - source pixels, 3 bytes each
 * dst      - destination buffer, receives 4 bytes per source pixel
 * src_size - size of src in bytes; the scalar loop always consumes 3 bytes
 *            per iteration
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /*
   * The last load of an iteration (punpckldq 21%1) is a 32-bit read covering
   * source bytes 21..24, so an iteration needs 25 readable bytes even though
   * it only consumes 24. Requiring s < end - 24 keeps that read inside the
   * buffer; the scalar loop below picks up the remaining pixels.
   */
  mm_end = end - 24;
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: three payload bytes followed by a zero filler byte. */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}
00111 
/*
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * The scalar loop copies bytes 0..2 of each source pixel and skips byte 3.
 * With HAVE_MMX, 32 source bytes are turned into 24 destination bytes per
 * iteration using the mask24* constants (defined outside this block); the
 * remainder goes through the scalar loop.
 *
 * src      - source pixels, 4 bytes each
 * dst      - destination buffer, receives 3 bytes per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  const uint8_t *const in_end = src + src_size;
  uint8_t *out = dst;
#ifdef HAVE_MMX
  /* One iteration reads 32 bytes, so stop while at least 32 remain. */
  const uint8_t *const simd_end = in_end - 31;

  __asm __volatile(PREFETCH"    %0"::"m"(*in):"memory");
  while(in < simd_end)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*out)
        :"m"(*in),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    out += 24;
    in += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: keep the first three bytes of each pixel, skip the fourth. */
  for(; in < in_end; in += 4, out += 3)
  {
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
  }
}
00188 
00189 /*
00190  Original by Strepto/Astral
00191  ported to gcc & bugfixed : A'rpi
00192  MMX2, 3DNOW optimization by Nick Kurshev
00193  32bit c version, and and&add trick by Michael Niedermayer
00194 */
/*
 * Convert 16-bit words from a 5-5-5 layout to a 5-6-5 layout.
 *
 * For every 16-bit word x the result is (x & 0x7FFF) + (x & 0x7FE0): bits
 * 0..4 are kept and bits 5..14 are added to themselves, which moves them up
 * by one bit position. The scalar path handles two words per 32-bit access
 * and a final single word if one is left; with HAVE_MMX, 16 bytes are
 * processed per iteration first.
 *
 * src      - source words, 2 bytes each
 * dst      - destination buffer, same size as the source
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  uint8_t *out = dst;
  const uint8_t *const in_end = src + src_size;
  const uint8_t *chunk_end;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  chunk_end = in_end - 15;
  for(; in < chunk_end; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two words at a time; the per-word sums never carry into the neighbour. */
  chunk_end = in_end - 3;
  for(; in < chunk_end; in += 4, out += 4)
  {
    const uint32_t pair = *(const uint32_t *)in;
    *(uint32_t *)out = (pair & 0x7FFF7FFF) + (pair & 0x7FE07FE0);
  }
  /* At most one 16-bit word can be left over. */
  if(in < in_end)
  {
    const uint16_t px = *(const uint16_t *)in;
    *(uint16_t *)out = (px & 0x7FFF) + (px & 0x7FE0);
  }
}
00243 
/*
 * Convert 16-bit words from a 5-6-5 layout to a 5-5-5 layout.
 *
 * For every 16-bit word x the result is ((x >> 1) & 0x7FE0) | (x & 0x001F):
 * bits 0..4 are kept, bits 6..15 move down by one position and bit 5 is
 * dropped. The scalar path handles two words per 32-bit access and a final
 * single word if one is left; with HAVE_MMX, 16 bytes are processed per
 * iteration first.
 *
 * src      - source words, 2 bytes each
 * dst      - destination buffer, same size as the source
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  uint8_t *out = dst;
  const uint8_t *const in_end = src + src_size;
  const uint8_t *chunk_end;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  chunk_end = in_end - 15;
  for(; in < chunk_end; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "psrlq  $1, %%mm0\n\t"
                "psrlq  $1, %%mm2\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm3\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two words at a time; the mask discards the bit shifted across words. */
  chunk_end = in_end - 3;
  for(; in < chunk_end; in += 4, out += 4)
  {
    const uint32_t pair = *(const uint32_t *)in;
    *(uint32_t *)out = ((pair >> 1) & 0x7FE07FE0) | (pair & 0x001F001F);
  }
  /* At most one 16-bit word can be left over. */
  if(in < in_end)
  {
    const uint16_t px = *(const uint16_t *)in;
    *(uint16_t *)out = ((px >> 1) & 0x7FE0) | (px & 0x001F);
  }
}
00299 
/*
 * Pack 32-bit pixels into 16-bit 5-6-5 words.
 *
 * Each source pixel is read as one native 32-bit word. In the scalar path
 * bits 3..7 become output bits 0..4, bits 10..15 become output bits 5..10
 * and bits 19..23 become output bits 11..15; bits 24..31 are ignored.
 * With HAVE_MMX, four pixels (16 bytes) are converted per iteration first.
 *
 * src      - source pixels, 4 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which cpus this is faster, on Athlon it's slightly faster)
        /*
         * The asm loop tests its condition at the bottom, so it always runs
         * at least once. Skip it entirely when fewer than 16 source bytes
         * are available, otherwise it would read and write past the buffers.
         */
        if(s < mm_end)
        asm volatile(
                "movq %3, %%mm5                 \n\t"
                "movq %4, %%mm6                 \n\t"
                "movq %5, %%mm7                 \n\t"
                ".balign 16                     \n\t"
                "1:                             \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd   (%1), %%mm0             \n\t"
                "movd   4(%1), %%mm3            \n\t"
                "punpckldq 8(%1), %%mm0         \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1              \n\t"
                "movq %%mm3, %%mm4              \n\t"
                "pand %%mm6, %%mm0              \n\t"
                "pand %%mm6, %%mm3              \n\t"
                "pmaddwd %%mm7, %%mm0           \n\t"
                "pmaddwd %%mm7, %%mm3           \n\t"
                "pand %%mm5, %%mm1              \n\t"
                "pand %%mm5, %%mm4              \n\t"
                "por %%mm1, %%mm0               \n\t"
                "por %%mm4, %%mm3               \n\t"
                "psrld $5, %%mm0                \n\t"
                "pslld $11, %%mm3               \n\t"
                "por %%mm3, %%mm0               \n\t"
                MOVNTQ" %%mm0, (%0)             \n\t"
                "addl $16, %1                   \n\t"
                "addl $8, %0                    \n\t"
                "cmpl %2, %1                    \n\t"
                " jb 1b                         \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                /*
                 * Load the whole 32-bit pixel. Dereferencing the byte pointer
                 * directly would fetch only the lowest byte and leave the two
                 * upper fields permanently zero.
                 */
                const uint32_t px = *(const uint32_t *)s;
                s += 4;
                *d++ = ((px&0xFF)>>3) + ((px&0xFC00)>>5) + ((px&0xF80000)>>8);
        }
}
00396 
/*
 * Pack 32-bit pixels into 16-bit 5-6-5 words with the outer fields swapped.
 *
 * Each source pixel is read as one native 32-bit word. In the scalar path
 * bits 3..7 become output bits 11..15, bits 10..15 become output bits 5..10
 * and bits 19..23 become output bits 0..4; bits 24..31 are ignored.
 * With HAVE_MMX, four pixels (16 bytes) are converted per iteration first.
 *
 * src      - source pixels, 4 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $8, %%mm0\n\t"
                "psllq  $8, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                /*
                 * Load the whole 32-bit pixel. Dereferencing the byte pointer
                 * directly would fetch only the lowest byte and leave the two
                 * lower output fields permanently zero.
                 */
                const uint32_t px = *(const uint32_t *)s;
                s += 4;
                *d++ = ((px&0xF8)<<8) + ((px&0xFC00)>>5) + ((px&0xF80000)>>19);
        }
}
00457 
/*
 * Pack 32-bit pixels into 16-bit 5-5-5 words.
 *
 * Each source pixel is read as one native 32-bit word. In the scalar path
 * bits 3..7 become output bits 0..4, bits 11..15 become output bits 5..9
 * and bits 19..23 become output bits 10..14; bits 24..31 are ignored.
 * With HAVE_MMX, four pixels (16 bytes) are converted per iteration first.
 *
 * src      - source pixels, 4 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which cpus this is faster, on Athlon it's slightly faster)
        /*
         * The asm loop tests its condition at the bottom, so it always runs
         * at least once. Skip it entirely when fewer than 16 source bytes
         * are available, otherwise it would read and write past the buffers.
         */
        if(s < mm_end)
        asm volatile(
                "movq %3, %%mm5                 \n\t"
                "movq %4, %%mm6                 \n\t"
                "movq %5, %%mm7                 \n\t"
                ".balign 16                     \n\t"
                "1:                             \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd   (%1), %%mm0             \n\t"
                "movd   4(%1), %%mm3            \n\t"
                "punpckldq 8(%1), %%mm0         \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1              \n\t"
                "movq %%mm3, %%mm4              \n\t"
                "pand %%mm6, %%mm0              \n\t"
                "pand %%mm6, %%mm3              \n\t"
                "pmaddwd %%mm7, %%mm0           \n\t"
                "pmaddwd %%mm7, %%mm3           \n\t"
                "pand %%mm5, %%mm1              \n\t"
                "pand %%mm5, %%mm4              \n\t"
                "por %%mm1, %%mm0               \n\t"
                "por %%mm4, %%mm3               \n\t"
                "psrld $6, %%mm0                \n\t"
                "pslld $10, %%mm3               \n\t"
                "por %%mm3, %%mm0               \n\t"
                MOVNTQ" %%mm0, (%0)             \n\t"
                "addl $16, %1                   \n\t"
                "addl $8, %0                    \n\t"
                "cmpl %2, %1                    \n\t"
                " jb 1b                         \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                /*
                 * Load the whole 32-bit pixel. Dereferencing the byte pointer
                 * directly would fetch only the lowest byte and leave the two
                 * upper fields permanently zero.
                 */
                const uint32_t px = *(const uint32_t *)s;
                s += 4;
                *d++ = ((px&0xFF)>>3) + ((px&0xF800)>>6) + ((px&0xF80000)>>9);
        }
}
00553 
/*
 * Pack 32-bit pixels into 16-bit 5-5-5 words with the outer fields swapped.
 *
 * Each source pixel is read as one native 32-bit word. In the scalar path
 * bits 3..7 become output bits 10..14, bits 11..15 become output bits 5..9
 * and bits 19..23 become output bits 0..4; bits 24..31 are ignored.
 * With HAVE_MMX, four pixels (16 bytes) are converted per iteration first.
 *
 * src      - source pixels, 4 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $7, %%mm0\n\t"
                "psllq  $7, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                /*
                 * Load the whole 32-bit pixel. Dereferencing the byte pointer
                 * directly would fetch only the lowest byte and leave the two
                 * lower output fields permanently zero.
                 */
                const uint32_t px = *(const uint32_t *)s;
                s += 4;
                *d++ = ((px&0xF8)<<7) + ((px&0xF800)>>6) + ((px&0xF80000)>>19);
        }
}
00614 
/*
 * Pack 3-byte pixels into 16-bit 5-6-5 words.
 *
 * In the scalar path the three source bytes are taken as b, g, r in that
 * order; the top 5 bits of b land in output bits 0..4, the top 6 bits of g
 * in bits 5..10 and the top 5 bits of r in bits 11..15. With HAVE_MMX, four
 * pixels (12 source bytes) are converted per iteration first.
 *
 * src      - source pixels, 3 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes; the scalar loop always consumes 3 bytes
 *            per iteration
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * The last load of an iteration (punpckldq 9%1) is a 32-bit read
         * covering source bytes 9..12, so an iteration needs 13 readable
         * bytes even though it only consumes 12. Requiring s < end - 12
         * keeps that read inside the buffer; the scalar loop below converts
         * the remaining pixels.
         */
        mm_end = end - 12;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
00677 
/*
 * Pack 3-byte pixels into 16-bit 5-6-5 words with the outer fields swapped.
 *
 * In the scalar path the three source bytes are taken as r, g, b in that
 * order; the top 5 bits of b land in output bits 0..4, the top 6 bits of g
 * in bits 5..10 and the top 5 bits of r in bits 11..15. With HAVE_MMX, four
 * pixels (12 source bytes) are converted per iteration first.
 *
 * src      - source pixels, 3 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const simd_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        for(; in < simd_end; in += 12, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $8, %%mm0\n\t"
                "psllq  $8, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel (three bytes) per iteration. */
        for(; in < in_end; in += 3)
        {
                const int r = in[0];
                const int g = in[1];
                const int b = in[2];
                *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
00740 
/*
 * Pack 3-byte pixels into 16-bit 5-5-5 words.
 *
 * In the scalar path the three source bytes are taken as b, g, r in that
 * order; the top 5 bits of b land in output bits 0..4, the top 5 bits of g
 * in bits 5..9 and the top 5 bits of r in bits 10..14. With HAVE_MMX, four
 * pixels (12 source bytes) are converted per iteration first.
 *
 * src      - source pixels, 3 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes; the scalar loop always consumes 3 bytes
 *            per iteration
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * The last load of an iteration (punpckldq 9%1) is a 32-bit read
         * covering source bytes 9..12, so an iteration needs 13 readable
         * bytes even though it only consumes 12. Requiring s < end - 12
         * keeps that read inside the buffer; the scalar loop below converts
         * the remaining pixels.
         */
        mm_end = end - 12;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
00803 
/*
 * Pack 3-byte pixels into 16-bit 5-5-5 words with the outer fields swapped.
 *
 * In the scalar path the three source bytes are taken as r, g, b in that
 * order; the top 5 bits of b land in output bits 0..4, the top 5 bits of g
 * in bits 5..9 and the top 5 bits of r in bits 10..14. With HAVE_MMX, four
 * pixels (12 source bytes) are converted per iteration first.
 *
 * src      - source pixels, 3 bytes each
 * dst      - destination buffer, receives one 16-bit word per source pixel
 * src_size - size of src in bytes
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const simd_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        for(; in < simd_end; in += 12, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $7, %%mm0\n\t"
                "psllq  $7, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel (three bytes) per iteration. */
        for(; in < in_end; in += 3)
        {
                const int r = in[0];
                const int g = in[1];
                const int b = in[2];
                *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
00862                 const int b= *s++;
00863                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00864         }
00865 }
00866 
00867 /*
00868   A less accurate approximation is used here: the
00869   input value is simply left-shifted and the low-order
00870   bits are filled with zeroes. This method improves
00871   PNG compression, but the scheme cannot reproduce
00872   white exactly, since it does not generate an
00873   all-ones maximum value; the net effect is to darken
00874   the image slightly.
00875 
00876   A better method would be "left bit replication":
00877 
00878    4 3 2 1 0
00879    ---------
00880    1 1 0 1 1
00881 
00882    7 6 5 4 3  2 1 0
00883    ----------------
00884    1 1 0 1 1  1 1 0
00885    |=======|  |===|
00886        |      Leftmost Bits Repeated to Fill Open Bits
00887        |
00888    Original Bits
00889 */
00890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
00891 {
00892         const uint16_t *end;
00893 #ifdef HAVE_MMX
00894         const uint16_t *mm_end;
00895 #endif
00896         uint8_t *d = (uint8_t *)dst;
00897         const uint16_t *s = (uint16_t *)src;
00898         end = s + src_size/2;
00899 #ifdef HAVE_MMX
00900         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
00901         mm_end = end - 7;
00902         while(s < mm_end)
00903         {
00904             __asm __volatile(
00905                 PREFETCH" 32%1\n\t"
00906                 "movq   %1, %%mm0\n\t"
00907                 "movq   %1, %%mm1\n\t"
00908                 "movq   %1, %%mm2\n\t"
00909                 "pand   %2, %%mm0\n\t"
00910                 "pand   %3, %%mm1\n\t"
00911                 "pand   %4, %%mm2\n\t"
00912                 "psllq  $3, %%mm0\n\t"
00913                 "psrlq  $2, %%mm1\n\t"
00914                 "psrlq  $7, %%mm2\n\t"
00915                 "movq   %%mm0, %%mm3\n\t"
00916                 "movq   %%mm1, %%mm4\n\t"
00917                 "movq   %%mm2, %%mm5\n\t"
00918                 "punpcklwd %5, %%mm0\n\t"
00919                 "punpcklwd %5, %%mm1\n\t"
00920                 "punpcklwd %5, %%mm2\n\t"
00921                 "punpckhwd %5, %%mm3\n\t"
00922                 "punpckhwd %5, %%mm4\n\t"
00923                 "punpckhwd %5, %%mm5\n\t"
00924                 "psllq  $8, %%mm1\n\t"
00925                 "psllq  $16, %%mm2\n\t"
00926                 "por    %%mm1, %%mm0\n\t"
00927                 "por    %%mm2, %%mm0\n\t"
00928                 "psllq  $8, %%mm4\n\t"
00929                 "psllq  $16, %%mm5\n\t"
00930                 "por    %%mm4, %%mm3\n\t"
00931                 "por    %%mm5, %%mm3\n\t"
00932 
00933                 "movq   %%mm0, %%mm6\n\t"
00934                 "movq   %%mm3, %%mm7\n\t"
00935                 
00936                 "movq   8%1, %%mm0\n\t"
00937                 "movq   8%1, %%mm1\n\t"
00938                 "movq   8%1, %%mm2\n\t"
00939                 "pand   %2, %%mm0\n\t"
00940                 "pand   %3, %%mm1\n\t"
00941                 "pand   %4, %%mm2\n\t"
00942                 "psllq  $3, %%mm0\n\t"
00943                 "psrlq  $2, %%mm1\n\t"
00944                 "psrlq  $7, %%mm2\n\t"
00945                 "movq   %%mm0, %%mm3\n\t"
00946                 "movq   %%mm1, %%mm4\n\t"
00947                 "movq   %%mm2, %%mm5\n\t"
00948                 "punpcklwd %5, %%mm0\n\t"
00949                 "punpcklwd %5, %%mm1\n\t"
00950                 "punpcklwd %5, %%mm2\n\t"
00951                 "punpckhwd %5, %%mm3\n\t"
00952                 "punpckhwd %5, %%mm4\n\t"
00953                 "punpckhwd %5, %%mm5\n\t"
00954                 "psllq  $8, %%mm1\n\t"
00955                 "psllq  $16, %%mm2\n\t"
00956                 "por    %%mm1, %%mm0\n\t"
00957                 "por    %%mm2, %%mm0\n\t"
00958                 "psllq  $8, %%mm4\n\t"
00959                 "psllq  $16, %%mm5\n\t"
00960                 "por    %%mm4, %%mm3\n\t"
00961                 "por    %%mm5, %%mm3\n\t"
00962 
00963                 :"=m"(*d)
00964                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
00965                 :"memory");
00966             /* Borrowed 32 to 24 */
00967             __asm __volatile(
00968                 "movq   %%mm0, %%mm4\n\t"
00969                 "movq   %%mm3, %%mm5\n\t"
00970                 "movq   %%mm6, %%mm0\n\t"
00971                 "movq   %%mm7, %%mm1\n\t"
00972                 
00973                 "movq   %%mm4, %%mm6\n\t"
00974                 "movq   %%mm5, %%mm7\n\t"
00975                 "movq   %%mm0, %%mm2\n\t"
00976                 "movq   %%mm1, %%mm3\n\t"
00977 
00978                 "psrlq  $8, %%mm2\n\t"
00979                 "psrlq  $8, %%mm3\n\t"
00980                 "psrlq  $8, %%mm6\n\t"
00981                 "psrlq  $8, %%mm7\n\t"
00982                 "pand   %2, %%mm0\n\t"
00983                 "pand   %2, %%mm1\n\t"
00984                 "pand   %2, %%mm4\n\t"
00985                 "pand   %2, %%mm5\n\t"
00986                 "pand   %3, %%mm2\n\t"
00987                 "pand   %3, %%mm3\n\t"
00988                 "pand   %3, %%mm6\n\t"
00989                 "pand   %3, %%mm7\n\t"
00990                 "por    %%mm2, %%mm0\n\t"
00991                 "por    %%mm3, %%mm1\n\t"
00992                 "por    %%mm6, %%mm4\n\t"
00993                 "por    %%mm7, %%mm5\n\t"
00994 
00995                 "movq   %%mm1, %%mm2\n\t"
00996                 "movq   %%mm4, %%mm3\n\t"
00997                 "psllq  $48, %%mm2\n\t"
00998                 "psllq  $32, %%mm3\n\t"
00999                 "pand   %4, %%mm2\n\t"
01000                 "pand   %5, %%mm3\n\t"
01001                 "por    %%mm2, %%mm0\n\t"
01002                 "psrlq  $16, %%mm1\n\t"
01003                 "psrlq  $32, %%mm4\n\t"
01004                 "psllq  $16, %%mm5\n\t"
01005                 "por    %%mm3, %%mm1\n\t"
01006                 "pand   %6, %%mm5\n\t"
01007                 "por    %%mm5, %%mm4\n\t"
01008 
01009                 MOVNTQ" %%mm0, %0\n\t"
01010                 MOVNTQ" %%mm1, 8%0\n\t"
01011                 MOVNTQ" %%mm4, 16%0"
01012 
01013                 :"=m"(*d)
01014                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
01015                 :"memory");
01016                 d += 24;
01017                 s += 8;
01018         }
01019         __asm __volatile(SFENCE:::"memory");
01020         __asm __volatile(EMMS:::"memory");
01021 #endif
01022         while(s < end)
01023         {
01024                 register uint16_t bgr;
01025                 bgr = *s++;
01026                 *d++ = (bgr&0x1F)<<3;
01027                 *d++ = (bgr&0x3E0)>>2;
01028                 *d++ = (bgr&0x7C00)>>7;
01029         }
01030 }
01031 
/*
 * Expand 16-bit pixels (one uint16_t each) to packed 24-bit pixels
 * (3 bytes each).
 *
 *   src      - input buffer, read as uint16_t values
 *   dst      - output buffer, 3 bytes written per input pixel
 *   src_size - number of input bytes (src_size/2 pixels)
 *
 * Scalar formula per pixel: bits 0..4 become the first output byte
 * (shifted left by 3), bits 5..10 the second (mask 0x7E0, shifted right by 3)
 * and bits 11..15 the third (mask 0xF800, shifted right by 8).
 * With HAVE_MMX, 8 pixels (16 input bytes -> 24 output bytes) are converted
 * per iteration while at least 8 pixels remain; the rest goes through the
 * scalar loop.
 *
 * mask16b/mask16g/mask16r, mmx_null and the mask24* constants are defined
 * outside this block; the comments below assume they select the fields
 * their names suggest - confirm against their definitions.
 */
01032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01033 {
01034         const uint16_t *end;
01035 #ifdef HAVE_MMX
01036         const uint16_t *mm_end;
01037 #endif
01038         uint8_t *d = (uint8_t *)dst;
01039         const uint16_t *s = (const uint16_t *)src;
01040         end = s + src_size/2;
01041 #ifdef HAVE_MMX
01042         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
01043         mm_end = end - 7; /* MMX loop needs 8 whole pixels */
01044         while(s < mm_end)
01045         {
                  /* Step 1: widen 2 x 4 pixels to one dword per pixel.
                     Result: mm6 = pixels 0,1  mm7 = pixels 2,3
                             mm0 = pixels 4,5  mm3 = pixels 6,7 */
01046             __asm __volatile(
01047                 PREFETCH" 32%1\n\t"
01048                 "movq   %1, %%mm0\n\t" /* pixels 0..3, one copy per field */
01049                 "movq   %1, %%mm1\n\t"
01050                 "movq   %1, %%mm2\n\t"
01051                 "pand   %2, %%mm0\n\t" /* mask16b */
01052                 "pand   %3, %%mm1\n\t" /* mask16g */
01053                 "pand   %4, %%mm2\n\t" /* mask16r */
01054                 "psllq  $3, %%mm0\n\t" /* same shifts as the scalar loop */
01055                 "psrlq  $3, %%mm1\n\t"
01056                 "psrlq  $8, %%mm2\n\t"
01057                 "movq   %%mm0, %%mm3\n\t"
01058                 "movq   %%mm1, %%mm4\n\t"
01059                 "movq   %%mm2, %%mm5\n\t"
01060                 "punpcklwd %5, %%mm0\n\t" /* low two words -> dwords (interleaved with mmx_null) */
01061                 "punpcklwd %5, %%mm1\n\t"
01062                 "punpcklwd %5, %%mm2\n\t"
01063                 "punpckhwd %5, %%mm3\n\t" /* high two words -> dwords */
01064                 "punpckhwd %5, %%mm4\n\t"
01065                 "punpckhwd %5, %%mm5\n\t"
01066                 "psllq  $8, %%mm1\n\t" /* second field into byte 1 */
01067                 "psllq  $16, %%mm2\n\t" /* third field into byte 2 */
01068                 "por    %%mm1, %%mm0\n\t"
01069                 "por    %%mm2, %%mm0\n\t"
01070                 "psllq  $8, %%mm4\n\t"
01071                 "psllq  $16, %%mm5\n\t"
01072                 "por    %%mm4, %%mm3\n\t"
01073                 "por    %%mm5, %%mm3\n\t"
01074                 
01075                 "movq   %%mm0, %%mm6\n\t" /* park pixels 0,1 */
01076                 "movq   %%mm3, %%mm7\n\t" /* park pixels 2,3 */
01077 
01078                 "movq   8%1, %%mm0\n\t" /* pixels 4..7, same treatment */
01079                 "movq   8%1, %%mm1\n\t"
01080                 "movq   8%1, %%mm2\n\t"
01081                 "pand   %2, %%mm0\n\t"
01082                 "pand   %3, %%mm1\n\t"
01083                 "pand   %4, %%mm2\n\t"
01084                 "psllq  $3, %%mm0\n\t"
01085                 "psrlq  $3, %%mm1\n\t"
01086                 "psrlq  $8, %%mm2\n\t"
01087                 "movq   %%mm0, %%mm3\n\t"
01088                 "movq   %%mm1, %%mm4\n\t"
01089                 "movq   %%mm2, %%mm5\n\t"
01090                 "punpcklwd %5, %%mm0\n\t"
01091                 "punpcklwd %5, %%mm1\n\t"
01092                 "punpcklwd %5, %%mm2\n\t"
01093                 "punpckhwd %5, %%mm3\n\t"
01094                 "punpckhwd %5, %%mm4\n\t"
01095                 "punpckhwd %5, %%mm5\n\t"
01096                 "psllq  $8, %%mm1\n\t"
01097                 "psllq  $16, %%mm2\n\t"
01098                 "por    %%mm1, %%mm0\n\t"
01099                 "por    %%mm2, %%mm0\n\t"
01100                 "psllq  $8, %%mm4\n\t"
01101                 "psllq  $16, %%mm5\n\t"
01102                 "por    %%mm4, %%mm3\n\t"
01103                 "por    %%mm5, %%mm3\n\t"
01104                 :"=m"(*d)
01105                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
01106                 :"memory");
01107             /* Borrowed 32 to 24 */
                  /* Step 2: squeeze the eight dword pixels left in the MMX registers
                     into 24 contiguous bytes; this relies on the registers surviving
                     between the two asm statements. */
01108             __asm __volatile(
01109                 "movq   %%mm0, %%mm4\n\t" /* mm4 = pixels 4,5 */
01110                 "movq   %%mm3, %%mm5\n\t" /* mm5 = pixels 6,7 */
01111                 "movq   %%mm6, %%mm0\n\t" /* mm0 = pixels 0,1 */
01112                 "movq   %%mm7, %%mm1\n\t" /* mm1 = pixels 2,3 */
01113                 
01114                 "movq   %%mm4, %%mm6\n\t"
01115                 "movq   %%mm5, %%mm7\n\t"
01116                 "movq   %%mm0, %%mm2\n\t"
01117                 "movq   %%mm1, %%mm3\n\t"
01118 
01119                 "psrlq  $8, %%mm2\n\t" /* shifted copies close the gap byte */
01120                 "psrlq  $8, %%mm3\n\t"
01121                 "psrlq  $8, %%mm6\n\t"
01122                 "psrlq  $8, %%mm7\n\t"
01123                 "pand   %2, %%mm0\n\t" /* mask24l on the unshifted copies */
01124                 "pand   %2, %%mm1\n\t"
01125                 "pand   %2, %%mm4\n\t"
01126                 "pand   %2, %%mm5\n\t"
01127                 "pand   %3, %%mm2\n\t" /* mask24h on the shifted copies */
01128                 "pand   %3, %%mm3\n\t"
01129                 "pand   %3, %%mm6\n\t"
01130                 "pand   %3, %%mm7\n\t"
01131                 "por    %%mm2, %%mm0\n\t" /* each register: its 2 pixels packed together */
01132                 "por    %%mm3, %%mm1\n\t"
01133                 "por    %%mm6, %%mm4\n\t"
01134                 "por    %%mm7, %%mm5\n\t"
01135 
01136                 "movq   %%mm1, %%mm2\n\t"
01137                 "movq   %%mm4, %%mm3\n\t"
01138                 "psllq  $48, %%mm2\n\t"
01139                 "psllq  $32, %%mm3\n\t"
01140                 "pand   %4, %%mm2\n\t" /* mask24hh */
01141                 "pand   %5, %%mm3\n\t" /* mask24hhh */
01142                 "por    %%mm2, %%mm0\n\t" /* output qword 0 */
01143                 "psrlq  $16, %%mm1\n\t"
01144                 "psrlq  $32, %%mm4\n\t"
01145                 "psllq  $16, %%mm5\n\t"
01146                 "por    %%mm3, %%mm1\n\t" /* output qword 1 */
01147                 "pand   %6, %%mm5\n\t" /* mask24hhhh */
01148                 "por    %%mm5, %%mm4\n\t" /* output qword 2 */
01149 
01150                 MOVNTQ" %%mm0, %0\n\t"
01151                 MOVNTQ" %%mm1, 8%0\n\t"
01152                 MOVNTQ" %%mm4, 16%0"
01153 
01154                 :"=m"(*d)
01155                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
01156                 :"memory");
01157                 d += 24; /* 8 pixels of 3 bytes */
01158                 s += 8; /* 8 input pixels */
01159         }
01160         __asm __volatile(SFENCE:::"memory");
01161         __asm __volatile(EMMS:::"memory");
01162 #endif
              /* scalar path: the remainder after the MMX loop, or everything without HAVE_MMX */
01163         while(s < end)
01164         {
01165                 register uint16_t bgr;
01166                 bgr = *s++;
01167                 *d++ = (bgr&0x1F)<<3;
01168                 *d++ = (bgr&0x7E0)>>3;
01169                 *d++ = (bgr&0xF800)>>8;
01170         }
01171 }
01172 
/*
 * Expand 15-bit pixels (one uint16_t each) to 32-bit pixels (4 bytes each).
 *
 *   src      - input buffer, read as uint16_t values
 *   dst      - output buffer, 4 bytes written per input pixel
 *   src_size - number of input bytes (src_size/2 pixels)
 *
 * Scalar formula per pixel: bits 0..4, 5..9 and 10..14 become output bytes
 * 0, 1 and 2 (each 5-bit field left-aligned, low 3 bits zero) and output
 * byte 3 is written as 0.
 * With HAVE_MMX, 4 pixels (8 input bytes -> 16 output bytes) are converted
 * per iteration while at least 4 pixels remain; the rest goes through the
 * scalar loop. mask15b/mask15g/mask15r are defined outside this block.
 */
01173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01174 {
01175         const uint16_t *end;
01176 #ifdef HAVE_MMX
01177         const uint16_t *mm_end;
01178 #endif
01179         uint8_t *d = (uint8_t *)dst;
01180         const uint16_t *s = (const uint16_t *)src;
01181         end = s + src_size/2;
01182 #ifdef HAVE_MMX
01183         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
01184         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0, the zero operand for the unpacks below */
01185         mm_end = end - 3; /* MMX loop needs 4 whole pixels */
01186         while(s < mm_end)
01187         {
01188             __asm __volatile(
01189                 PREFETCH" 32%1\n\t"
01190                 "movq   %1, %%mm0\n\t" /* 4 pixels, one copy per field */
01191                 "movq   %1, %%mm1\n\t"
01192                 "movq   %1, %%mm2\n\t"
01193                 "pand   %2, %%mm0\n\t" /* mask15b */
01194                 "pand   %3, %%mm1\n\t" /* mask15g */
01195                 "pand   %4, %%mm2\n\t" /* mask15r */
01196                 "psllq  $3, %%mm0\n\t" /* same shifts as the scalar loop */
01197                 "psrlq  $2, %%mm1\n\t"
01198                 "psrlq  $7, %%mm2\n\t"
01199                 "movq   %%mm0, %%mm3\n\t"
01200                 "movq   %%mm1, %%mm4\n\t"
01201                 "movq   %%mm2, %%mm5\n\t"
01202                 "punpcklwd %%mm7, %%mm0\n\t" /* pixels 0,1: words -> dwords */
01203                 "punpcklwd %%mm7, %%mm1\n\t"
01204                 "punpcklwd %%mm7, %%mm2\n\t"
01205                 "punpckhwd %%mm7, %%mm3\n\t" /* pixels 2,3: words -> dwords */
01206                 "punpckhwd %%mm7, %%mm4\n\t"
01207                 "punpckhwd %%mm7, %%mm5\n\t"
01208                 "psllq  $8, %%mm1\n\t" /* second field into byte 1 */
01209                 "psllq  $16, %%mm2\n\t" /* third field into byte 2 */
01210                 "por    %%mm1, %%mm0\n\t"
01211                 "por    %%mm2, %%mm0\n\t"
01212                 "psllq  $8, %%mm4\n\t"
01213                 "psllq  $16, %%mm5\n\t"
01214                 "por    %%mm4, %%mm3\n\t"
01215                 "por    %%mm5, %%mm3\n\t"
01216                 MOVNTQ" %%mm0, %0\n\t" /* pixels 0,1 */
01217                 MOVNTQ" %%mm3, 8%0\n\t" /* pixels 2,3 */
01218                 :"=m"(*d)
01219                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
01220                 :"memory");
01221                 d += 16; /* 4 pixels of 4 bytes */
01222                 s += 4; /* 4 input pixels */
01223         }
01224         __asm __volatile(SFENCE:::"memory");
01225         __asm __volatile(EMMS:::"memory");
01226 #endif
              /* scalar path: the remainder after the MMX loop, or everything without HAVE_MMX */
01227         while(s < end)
01228         {
                      /* disabled alternative: builds the whole pixel as a single 32-bit store */
01229 #if 0 //slightly slower on athlon
01230                 int bgr= *s++;
01231                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
01232 #else
01233 //FIXME this is very likely wrong for bigendian (and the following converters too)
01234                 register uint16_t bgr;
01235                 bgr = *s++;
01236                 *d++ = (bgr&0x1F)<<3;
01237                 *d++ = (bgr&0x3E0)>>2;
01238                 *d++ = (bgr&0x7C00)>>7;
01239                 *d++ = 0; /* fourth byte is always zero */
01240 #endif
01241         }
01242 }
01243 
01244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
01245 {
01246         const uint16_t *end;
01247 #ifdef HAVE_MMX
01248         const uint16_t *mm_end;
01249 #endif
01250         uint8_t *d = (uint8_t *)dst;
01251         const uint16_t *s = (uint16_t *)src;
01252         end = s + src_size/2;
01253 #ifdef HAVE_MMX
01254         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
01255         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
01256         mm_end = end - 3;
01257         while(s < mm_end)
01258         {
01259             __asm __volatile(
01260                 PREFETCH" 32%1\n\t"
01261                 "movq   %1, %%mm0\n\t"
01262                 "movq   %1, %%mm1\n\t"
01263                 "movq   %1, %%mm2\n\t"
01264                 "pand   %2, %%mm0\n\t"
01265                 "pand   %3, %%mm1\n\t"
01266                 "pand   %4, %%mm2\n\t"
01267                 "psllq  $3, %%mm0\n\t"
01268                 "psrlq  $3, %%mm1\n\t"
01269                 "psrlq  $8, %%mm2\n\t"
01270                 "movq   %%mm0, %%mm3\n\t"
01271                 "movq   %%mm1, %%mm4\n\t"
01272                 "movq   %%mm2, %%mm5\n\t"
01273                 "punpcklwd %%mm7, %%mm0\n\t"
01274                 "punpcklwd %%mm7, %%mm1\n\t"
01275                 "punpcklwd %%mm7, %%mm2\n\t"
01276                 "punpckhwd %%mm7, %%mm3\n\t"
01277                 "punpckhwd %%mm7, %%mm4\n\t"
01278                 "punpckhwd %%mm7, %%mm5\n\t"
01279                 "psllq  $8, %%mm1\n\t"
01280                 "psllq  $16, %%mm2\n\t"
01281                 "por    %%mm1, %%mm0\n\t"
01282                 "por    %%mm2, %%mm0\n\t"
01283                 "psllq  $8, %%mm4\n\t"
01284                 "psllq  $16, %%mm5\n\t"
01285                 "por    %%mm4, %%mm3\n\t"
01286                 "por    %%mm5, %%mm3\n\t"
01287                 MOVNTQ" %%mm0, %0\n\t"
01288                 MOVNTQ" %%mm3, 8%0\n\t"
01289                 :"=m"(*d)
01290                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
01291                 :"memory");
01292                 d += 16;
01293                 s += 4;
01294         }
01295         __asm __volatile(SFENCE:::"memory");
01296         __asm __volatile(EMMS:::"memory");
01297 #endif
01298         while(s < end)
01299         {
01300                 register uint16_t bgr;
01301                 bgr = *s++;
01302                 *d++ = (bgr&0x1F)<<3;
01303                 *d++ = (bgr&0x7E0)>>3;
01304                 *d++ = (bgr&0xF800)>>8;
01305                 *d++ = 0;
01306         }
01307 }
01308 
01309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
01310 {
01311 #ifdef HAVE_MMX
01312 /* TODO: unroll this loop */
01313         asm volatile (
01314                 "xorl %%eax, %%eax              \n\t"
01315                 ".balign 16                     \n\t"
01316                 "1:                             \n\t"
01317                 PREFETCH" 32(%0, %%eax)         \n\t"
01318                 "movq (%0, %%eax), %%mm0        \n\t"
01319                 "movq %%mm0, %%mm1              \n\t"
01320                 "movq %%mm0, %%mm2              \n\t"
01321                 "pslld $16, %%mm0               \n\t"
01322                 "psrld $16, %%mm1               \n\t"
01323                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
01324                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
01325                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
01326                 "por %%mm0, %%mm2               \n\t"
01327                 "por %%mm1, %%mm2               \n\t"
01328                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
01329                 "addl $8, %%eax                 \n\t"
01330                 "cmpl %2, %%eax                 \n\t"
01331                 " jb 1b                         \n\t"
01332                 :: "r" (src), "r"(dst), "r" (src_size-7)
01333                 : "%eax"
01334         );
01335 
01336         __asm __volatile(SFENCE:::"memory");
01337         __asm __volatile(EMMS:::"memory");
01338 #else
01339         unsigned i;
01340         unsigned num_pixels = src_size >> 2;
01341         for(i=0; i<num_pixels; i++)
01342         {
01343 #ifdef WORDS_BIGENDIAN  
01344           dst[4*i + 1] = src[4*i + 3];
01345           dst[4*i + 2] = src[4*i + 2];
01346           dst[4*i + 3] = src[4*i + 1];
01347 #else
01348           dst[4*i + 0] = src[4*i + 2];
01349           dst[4*i + 1] = src[4*i + 1];
01350           dst[4*i + 2] = src[4*i + 0];
01351 #endif
01352         }
01353 #endif
01354 }
01355 
01356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
01357 {
01358         unsigned i;
01359 #ifdef HAVE_MMX
01360         int mmx_size= 23 - src_size;
01361         asm volatile (
01362                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
01363                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
01364                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
01365                 ".balign 16                     \n\t"
01366                 "1:                             \n\t"
01367                 PREFETCH" 32(%1, %%eax)         \n\t"
01368                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
01369                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
01370                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
01371                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
01372                 "pand %%mm5, %%mm0              \n\t"
01373                 "pand %%mm6, %%mm1              \n\t"
01374                 "pand %%mm7, %%mm2              \n\t"
01375                 "por %%mm0, %%mm1               \n\t"
01376                 "por %%mm2, %%mm1               \n\t"                
01377                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
01378                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
01379                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
01380                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
01381                 "pand %%mm7, %%mm0              \n\t"
01382                 "pand %%mm5, %%mm1              \n\t"
01383                 "pand %%mm6, %%mm2              \n\t"
01384                 "por %%mm0, %%mm1               \n\t"
01385                 "por %%mm2, %%mm1               \n\t"                
01386                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
01387                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
01388                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
01389                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
01390                 "pand %%mm6, %%mm0              \n\t"
01391                 "pand %%mm7, %%mm1              \n\t"
01392                 "pand %%mm5, %%mm2              \n\t"
01393                 "por %%mm0, %%mm1               \n\t"
01394                 "por %%mm2, %%mm1               \n\t"                
01395                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
01396                 "addl $24, %%eax                \n\t"
01397                 " js 1b                         \n\t"
01398                 : "+a" (mmx_size)
01399                 : "r" (src-mmx_size), "r"(dst-mmx_size)
01400         );
01401 
01402         __asm __volatile(SFENCE:::"memory");
01403         __asm __volatile(EMMS:::"memory");
01404 
01405         if(mmx_size==23) return; //finihsed, was multiple of 8
01406 
01407         src+= src_size;
01408         dst+= src_size;
01409         src_size= 23-mmx_size;
01410         src-= src_size;
01411         dst-= src_size;
01412 #endif
01413         for(i=0; i<src_size; i+=3)
01414         {
01415                 register uint8_t x;
01416                 x          = src[i + 2];
01417                 dst[i + 1] = src[i + 1];
01418                 dst[i + 2] = src[i + 0];
01419                 dst[i + 0] = x;
01420         }
01421 }
01422 
01423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01424         unsigned int width, unsigned int height,
01425         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01426 {
01427         unsigned y;
01428         const unsigned chromWidth= width>>1;
01429         for(y=0; y<height; y++)
01430         {
01431 #ifdef HAVE_MMX
01432 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
01433                 asm volatile(
01434                         "xorl %%eax, %%eax              \n\t"
01435                         ".balign 16                     \n\t"
01436                         "1:                             \n\t"
01437                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
01438                         PREFETCH" 32(%2, %%eax)         \n\t"
01439                         PREFETCH" 32(%3, %%eax)         \n\t"
01440                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
01441                         "movq %%mm0, %%mm2              \n\t" // U(0)
01442                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
01443                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
01444                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
01445 
01446                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
01447                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
01448                         "movq %%mm3, %%mm4              \n\t" // Y(0)
01449                         "movq %%mm5, %%mm6              \n\t" // Y(8)
01450                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
01451                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
01452                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
01453                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
01454 
01455                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
01456                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
01457                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
01458                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
01459 
01460                         "addl $8, %%eax                 \n\t"
01461                         "cmpl %4, %%eax                 \n\t"
01462                         " jb 1b                         \n\t"
01463                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01464                         : "%eax"
01465                 );
01466 #else
01467 
01468 #if defined ARCH_ALPHA && defined HAVE_MVI
01469 #define pl2yuy2(n)                                      \
01470         y1 = yc[n];                                     \
01471         y2 = yc2[n];                                    \
01472         u = uc[n];                                      \
01473         v = vc[n];                                      \
01474         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
01475         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
01476         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
01477         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
01478         yuv1 = (u << 8) + (v << 24);                    \
01479         yuv2 = yuv1 + y2;                               \
01480         yuv1 += y1;                                     \
01481         qdst[n] = yuv1;                                 \
01482         qdst2[n] = yuv2;
01483 
01484                 int i;
01485                 uint64_t *qdst = (uint64_t *) dst;
01486                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
01487                 const uint32_t *yc = (uint32_t *) ysrc;
01488                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
01489                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
01490                 for(i = 0; i < chromWidth; i += 8){
01491                         uint64_t y1, y2, yuv1, yuv2;
01492                         uint64_t u, v;
01493                         /* Prefetch */
01494                         asm("ldq $31,64(%0)" :: "r"(yc));
01495                         asm("ldq $31,64(%0)" :: "r"(yc2));
01496                         asm("ldq $31,64(%0)" :: "r"(uc));
01497                         asm("ldq $31,64(%0)" :: "r"(vc));
01498 
01499                         pl2yuy2(0);
01500                         pl2yuy2(1);
01501                         pl2yuy2(2);
01502                         pl2yuy2(3);
01503 
01504                         yc += 4;
01505                         yc2 += 4;
01506                         uc += 4;
01507                         vc += 4;
01508                         qdst += 4;
01509                         qdst2 += 4;
01510                 }
01511                 y++;
01512                 ysrc += lumStride;
01513                 dst += dstStride;
01514 
01515 #elif __WORDSIZE >= 64
01516                 int i;
01517                 uint64_t *ldst = (uint64_t *) dst;
01518                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01519                 for(i = 0; i < chromWidth; i += 2){
01520                         uint64_t k, l;
01521                         k = yc[0] + (uc[0] << 8) +
01522                             (yc[1] << 16) + (vc[0] << 24);
01523                         l = yc[2] + (uc[1] << 8) +
01524                             (yc[3] << 16) + (vc[1] << 24);
01525                         *ldst++ = k + (l << 32);
01526                         yc += 4;
01527                         uc += 2;
01528                         vc += 2;
01529                 }
01530 
01531 #else
01532                 int i, *idst = (int32_t *) dst;
01533                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01534                 for(i = 0; i < chromWidth; i++){
01535 #ifdef WORDS_BIGENDIAN
01536                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
01537                             (yc[1] << 8) + (vc[0] << 0);
01538 #else
01539                         *idst++ = yc[0] + (uc[0] << 8) +
01540                             (yc[1] << 16) + (vc[0] << 24);
01541 #endif
01542                         yc += 2;
01543                         uc++;
01544                         vc++;
01545                 }
01546 #endif
01547 #endif
01548                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
01549                 {
01550                         usrc += chromStride;
01551                         vsrc += chromStride;
01552                 }
01553                 ysrc += lumStride;
01554                 dst += dstStride;
01555         }
01556 #ifdef HAVE_MMX
01557 asm(    EMMS" \n\t"
01558         SFENCE" \n\t"
01559         :::"memory");
01560 #endif
01561 }
01562 
01568 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01569         unsigned int width, unsigned int height,
01570         int lumStride, int chromStride, int dstStride)
01571 {
01572         //FIXME interpolate chroma
01573         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01574 }
01575 
01576 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01577         unsigned int width, unsigned int height,
01578         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01579 {
01580         unsigned y;
01581         const unsigned chromWidth= width>>1;
01582         for(y=0; y<height; y++)
01583         {
01584 #ifdef HAVE_MMX
01585 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
01586                 asm volatile(
01587                         "xorl %%eax, %%eax              \n\t"
01588                         ".balign 16                     \n\t"
01589                         "1:                             \n\t"
01590                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
01591                         PREFETCH" 32(%2, %%eax)         \n\t"
01592                         PREFETCH" 32(%3, %%eax)         \n\t"
01593                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
01594                         "movq %%mm0, %%mm2              \n\t" // U(0)
01595                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
01596                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
01597                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
01598 
01599                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
01600                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
01601                         "movq %%mm0, %%mm4              \n\t" // Y(0)
01602                         "movq %%mm2, %%mm6              \n\t" // Y(8)
01603                         "punpcklbw %%mm3, %%mm0         \n\t" // YUYV YUYV(0)
01604                         "punpckhbw %%mm3, %%mm4         \n\t" // YUYV YUYV(4)
01605                         "punpcklbw %%mm5, %%mm2         \n\t" // YUYV YUYV(8)
01606                         "punpckhbw %%mm5, %%mm6         \n\t" // YUYV YUYV(12)
01607 
01608                         MOVNTQ" %%mm0, (%0, %%eax, 4)   \n\t"
01609                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
01610                         MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
01611                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
01612 
01613                         "addl $8, %%eax                 \n\t"
01614                         "cmpl %4, %%eax                 \n\t"
01615                         " jb 1b                         \n\t"
01616                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01617                         : "%eax"
01618                 );
01619 #else
01620 //FIXME adapt the alpha asm code from yv12->yuy2
01621 
01622 #if __WORDSIZE >= 64
01623                 int i;
01624                 uint64_t *ldst = (uint64_t *) dst;
01625                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01626                 for(i = 0; i < chromWidth; i += 2){
01627                         uint64_t k, l;
01628                         k = uc[0] + (yc[0] << 8) +
01629                             (vc[0] << 16) + (yc[1] << 24);
01630                         l = uc[1] + (yc[2] << 8) +
01631                             (vc[1] << 16) + (yc[3] << 24);
01632                         *ldst++ = k + (l << 32);
01633                         yc += 4;
01634                         uc += 2;
01635                         vc += 2;
01636                 }
01637 
01638 #else
01639                 int i, *idst = (int32_t *) dst;
01640                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
01641                 for(i = 0; i < chromWidth; i++){
01642 #ifdef WORDS_BIGENDIAN
01643                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
01644                             (vc[0] << 8) + (yc[1] << 0);
01645 #else
01646                         *idst++ = uc[0] + (yc[0] << 8) +
01647                             (vc[0] << 16) + (yc[1] << 24);
01648 #endif
01649                         yc += 2;
01650                         uc++;
01651                         vc++;
01652                 }
01653 #endif
01654 #endif
01655                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
01656                 {
01657                         usrc += chromStride;
01658                         vsrc += chromStride;
01659                 }
01660                 ysrc += lumStride;
01661                 dst += dstStride;
01662         }
01663 #ifdef HAVE_MMX
01664 asm(    EMMS" \n\t"
01665         SFENCE" \n\t"
01666         :::"memory");
01667 #endif
01668 }
01669 
01675 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01676         unsigned int width, unsigned int height,
01677         int lumStride, int chromStride, int dstStride)
01678 {
01679         //FIXME interpolate chroma
01680         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01681 }
01682 
01687 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01688         unsigned int width, unsigned int height,
01689         int lumStride, int chromStride, int dstStride)
01690 {
01691         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01692 }
01693 
01699 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01700         unsigned int width, unsigned int height,
01701         int lumStride, int chromStride, int srcStride)
01702 {
01703         unsigned y;
01704         const unsigned chromWidth= width>>1;
01705         for(y=0; y<height; y+=2)
01706         {
01707 #ifdef HAVE_MMX
01708                 asm volatile(
01709                         "xorl %%eax, %%eax              \n\t"
01710                         "pcmpeqw %%mm7, %%mm7           \n\t"
01711                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
01712                         ".balign 16                     \n\t"
01713                         "1:                             \n\t"
01714                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
01715                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
01716                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
01717                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
01718                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
01719                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
01720                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
01721                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
01722                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
01723                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
01724                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
01725 
01726                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
01727 
01728                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
01729                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
01730                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
01731                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
01732                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
01733                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
01734                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
01735                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
01736                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
01737                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
01738 
01739                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
01740 
01741                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
01742                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
01743                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
01744                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
01745                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
01746                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
01747                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
01748                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
01749 
01750                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
01751                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
01752 
01753                         "addl $8, %%eax                 \n\t"
01754                         "cmpl %4, %%eax                 \n\t"
01755                         " jb 1b                         \n\t"
01756                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01757                         : "memory", "%eax"
01758                 );
01759 
01760                 ydst += lumStride;
01761                 src  += srcStride;
01762 
01763                 asm volatile(
01764                         "xorl %%eax, %%eax              \n\t"
01765                         ".balign 16                     \n\t"
01766                         "1:                             \n\t"
01767                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
01768                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
01769                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
01770                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
01771                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
01772                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
01773                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
01774                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
01775                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
01776                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
01777                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
01778 
01779                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
01780                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
01781 
01782                         "addl $8, %%eax                 \n\t"
01783                         "cmpl %4, %%eax                 \n\t"
01784                         " jb 1b                         \n\t"
01785 
01786                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01787                         : "memory", "%eax"
01788                 );
01789 #else
01790                 unsigned i;
01791                 for(i=0; i<chromWidth; i++)
01792                 {
01793                         ydst[2*i+0]     = src[4*i+0];
01794                         udst[i]         = src[4*i+1];
01795                         ydst[2*i+1]     = src[4*i+2];
01796                         vdst[i]         = src[4*i+3];
01797                 }
01798                 ydst += lumStride;
01799                 src  += srcStride;
01800 
01801                 for(i=0; i<chromWidth; i++)
01802                 {
01803                         ydst[2*i+0]     = src[4*i+0];
01804                         ydst[2*i+1]     = src[4*i+2];
01805                 }
01806 #endif
01807                 udst += chromStride;
01808                 vdst += chromStride;
01809                 ydst += lumStride;
01810                 src  += srcStride;
01811         }
01812 #ifdef HAVE_MMX
01813 asm volatile(   EMMS" \n\t"
01814                 SFENCE" \n\t"
01815                 :::"memory");
01816 #endif
01817 }
01818 
01819 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
01820         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01821         unsigned int width, unsigned int height, int lumStride, int chromStride)
01822 {
01823         /* Y Plane */
01824         memcpy(ydst, ysrc, width*height);
01825 
01826         /* XXX: implement upscaling for U,V */
01827 }
01828 
01829 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
01830 {
01831         int x,y;
01832         
01833         dst[0]= src[0];
01834         
01835         // first line
01836         for(x=0; x<srcWidth-1; x++){
01837                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01838                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01839         }
01840         dst[2*srcWidth-1]= src[srcWidth-1];
01841         
01842         dst+= dstStride;
01843 
01844         for(y=1; y<srcHeight; y++){
01845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01846                 const int mmxSize= srcWidth&~15;
01847                 asm volatile(
01848                         "movl %4, %%eax                 \n\t"
01849                         "1:                             \n\t"
01850                         "movq (%0, %%eax), %%mm0        \n\t"
01851                         "movq (%1, %%eax), %%mm1        \n\t"
01852                         "movq 1(%0, %%eax), %%mm2       \n\t"
01853                         "movq 1(%1, %%eax), %%mm3       \n\t"
01854                         "movq -1(%0, %%eax), %%mm4      \n\t"
01855                         "movq -1(%1, %%eax), %%mm5      \n\t"
01856                         PAVGB" %%mm0, %%mm5             \n\t"
01857                         PAVGB" %%mm0, %%mm3             \n\t"
01858                         PAVGB" %%mm0, %%mm5             \n\t"
01859                         PAVGB" %%mm0, %%mm3             \n\t"
01860                         PAVGB" %%mm1, %%mm4             \n\t"
01861                         PAVGB" %%mm1, %%mm2             \n\t"
01862                         PAVGB" %%mm1, %%mm4             \n\t"
01863                         PAVGB" %%mm1, %%mm2             \n\t"
01864                         "movq %%mm5, %%mm7              \n\t"
01865                         "movq %%mm4, %%mm6              \n\t"
01866                         "punpcklbw %%mm3, %%mm5         \n\t"
01867                         "punpckhbw %%mm3, %%mm7         \n\t"
01868                         "punpcklbw %%mm2, %%mm4         \n\t"
01869                         "punpckhbw %%mm2, %%mm6         \n\t"
01870 #if 1
01871                         MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
01872                         MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
01873                         MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
01874                         MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
01875 #else
01876                         "movq %%mm5, (%2, %%eax, 2)     \n\t"
01877                         "movq %%mm7, 8(%2, %%eax, 2)    \n\t"
01878                         "movq %%mm4, (%3, %%eax, 2)     \n\t"
01879                         "movq %%mm6, 8(%3, %%eax, 2)    \n\t"
01880 #endif
01881                         "addl $8, %%eax                 \n\t"
01882                         " js 1b                         \n\t"
01883                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
01884                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
01885                            "g" (-mmxSize)
01886                         : "%eax"
01887 
01888                 );
01889 #else
01890                 const int mmxSize=1;
01891 #endif
01892                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
01893                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
01894 
01895                 for(x=mmxSize-1; x<srcWidth-1; x++){
01896                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
01897                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
01898                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
01899                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
01900                 }
01901                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
01902                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
01903 
01904                 dst+=dstStride*2;
01905                 src+=srcStride;
01906         }
01907         
01908         // last line
01909 #if 1
01910         dst[0]= src[0];
01911         
01912         for(x=0; x<srcWidth-1; x++){
01913                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01914                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01915         }
01916         dst[2*srcWidth-1]= src[srcWidth-1];
01917 #else
01918         for(x=0; x<srcWidth; x++){
01919                 dst[2*x+0]=
01920                 dst[2*x+1]= src[x];
01921         }
01922 #endif
01923 
01924 #ifdef HAVE_MMX
01925 asm volatile(   EMMS" \n\t"
01926                 SFENCE" \n\t"
01927                 :::"memory");
01928 #endif
01929 }
01930 
01937 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01938         unsigned int width, unsigned int height,
01939         int lumStride, int chromStride, int srcStride)
01940 {
01941         unsigned y;
01942         const unsigned chromWidth= width>>1;
01943         for(y=0; y<height; y+=2)
01944         {
01945 #ifdef HAVE_MMX
01946                 asm volatile(
01947                         "xorl %%eax, %%eax              \n\t"
01948                         "pcmpeqw %%mm7, %%mm7           \n\t"
01949                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
01950                         ".balign 16                     \n\t"
01951                         "1:                             \n\t"
01952                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
01953                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
01954                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
01955                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
01956                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
01957                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
01958                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
01959                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
01960                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
01961                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
01962                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
01963 
01964                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
01965 
01966                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
01967                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
01968                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
01969                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
01970                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
01971                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
01972                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
01973                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
01974                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
01975                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
01976 
01977                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
01978 
01979                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
01980                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
01981                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
01982                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
01983                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
01984                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
01985                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
01986                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
01987 
01988                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
01989                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
01990 
01991                         "addl $8, %%eax                 \n\t"
01992                         "cmpl %4, %%eax                 \n\t"
01993                         " jb 1b                         \n\t"
01994                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01995                         : "memory", "%eax"
01996                 );
01997 
01998                 ydst += lumStride;
01999                 src  += srcStride;
02000 
02001                 asm volatile(
02002                         "xorl %%eax, %%eax              \n\t"
02003                         ".balign 16                     \n\t"
02004                         "1:                             \n\t"
02005                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
02006                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
02007                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
02008                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
02009                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
02010                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
02011                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
02012                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
02013                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
02014                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
02015                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
02016 
02017                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
02018                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
02019 
02020                         "addl $8, %%eax                 \n\t"
02021                         "cmpl %4, %%eax                 \n\t"
02022                         " jb 1b                         \n\t"
02023 
02024                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
02025                         : "memory", "%eax"
02026                 );
02027 #else
02028                 unsigned i;
02029                 for(i=0; i<chromWidth; i++)
02030                 {
02031                         udst[i]         = src[4*i+0];
02032                         ydst[2*i+0]     = src[4*i+1];
02033                         vdst[i]         = src[4*i+2];
02034                         ydst[2*i+1]     = src[4*i+3];
02035                 }
02036                 ydst += lumStride;
02037                 src  += srcStride;
02038 
02039                 for(i=0; i<chromWidth; i++)
02040                 {
02041                         ydst[2*i+0]     = src[4*i+1];
02042                         ydst[2*i+1]     = src[4*i+3];
02043                 }
02044 #endif
02045                 udst += chromStride;
02046                 vdst += chromStride;
02047                 ydst += lumStride;
02048                 src  += srcStride;
02049         }
02050 #ifdef HAVE_MMX
02051 asm volatile(   EMMS" \n\t"
02052                 SFENCE" \n\t"
02053                 :::"memory");
02054 #endif
02055 }
02056 
02063 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
02064         unsigned int width, unsigned int height,
02065         int lumStride, int chromStride, int srcStride)
02066 {
02067         unsigned y;
02068         const unsigned chromWidth= width>>1;
02069 #ifdef HAVE_MMX
02070         for(y=0; y<height-2; y+=2)
02071         {
02072                 unsigned i;
02073                 for(i=0; i<2; i++)
02074                 {
02075                         asm volatile(
02076                                 "movl %2, %%eax                 \n\t"
02077                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
02078                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
02079                                 "pxor %%mm7, %%mm7              \n\t"
02080                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
02081                                 ".balign 16                     \n\t"
02082                                 "1:                             \n\t"
02083                                 PREFETCH" 64(%0, %%ebx)         \n\t"
02084                                 "movd (%0, %%ebx), %%mm0        \n\t"
02085                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
02086                                 "punpcklbw %%mm7, %%mm0         \n\t"
02087                                 "punpcklbw %%mm7, %%mm1         \n\t"
02088                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
02089                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
02090                                 "punpcklbw %%mm7, %%mm2         \n\t"
02091                                 "punpcklbw %%mm7, %%mm3         \n\t"
02092                                 "pmaddwd %%mm6, %%mm0           \n\t"
02093                                 "pmaddwd %%mm6, %%mm1           \n\t"
02094                                 "pmaddwd %%mm6, %%mm2           \n\t"
02095                                 "pmaddwd %%mm6, %%mm3           \n\t"
02096 #ifndef FAST_BGR2YV12
02097                                 "psrad $8, %%mm0                \n\t"
02098                                 "psrad $8, %%mm1                \n\t"
02099                                 "psrad $8, %%mm2                \n\t"
02100                                 "psrad $8, %%mm3                \n\t"
02101 #endif
02102                                 "packssdw %%mm1, %%mm0          \n\t"
02103                                 "packssdw %%mm3, %%mm2          \n\t"
02104                                 "pmaddwd %%mm5, %%mm0           \n\t"
02105                                 "pmaddwd %%mm5, %%mm2           \n\t"
02106                                 "packssdw %%mm2, %%mm0          \n\t"
02107                                 "psraw $7, %%mm0                \n\t"
02108 
02109                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
02110                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
02111                                 "punpcklbw %%mm7, %%mm4         \n\t"
02112                                 "punpcklbw %%mm7, %%mm1         \n\t"
02113                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
02114                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
02115                                 "punpcklbw %%mm7, %%mm2         \n\t"
02116                                 "punpcklbw %%mm7, %%mm3         \n\t"
02117                                 "pmaddwd %%mm6, %%mm4           \n\t"
02118                                 "pmaddwd %%mm6, %%mm1           \n\t"
02119                                 "pmaddwd %%mm6, %%mm2           \n\t"
02120                                 "pmaddwd %%mm6, %%mm3           \n\t"
02121 #ifndef FAST_BGR2YV12
02122                                 "psrad $8, %%mm4                \n\t"
02123                                 "psrad $8, %%mm1                \n\t"
02124                                 "psrad $8, %%mm2                \n\t"
02125                                 "psrad $8, %%mm3                \n\t"
02126 #endif
02127                                 "packssdw %%mm1, %%mm4          \n\t"
02128                                 "packssdw %%mm3, %%mm2          \n\t"
02129                                 "pmaddwd %%mm5, %%mm4           \n\t"
02130                                 "pmaddwd %%mm5, %%mm2           \n\t"
02131                                 "addl $24, %%ebx                \n\t"
02132                                 "packssdw %%mm2, %%mm4          \n\t"
02133                                 "psraw $7, %%mm4                \n\t"
02134 
02135                                 "packuswb %%mm4, %%mm0          \n\t"
02136                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
02137 
02138                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
02139                                 "addl $8, %%eax                 \n\t"
02140                                 " js 1b                         \n\t"
02141                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
02142                                 : "%eax", "%ebx"
02143                         );
02144                         ydst += lumStride;
02145                         src  += srcStride;
02146                 }
02147                 src -= srcStride*2;
02148                 asm volatile(
02149                         "movl %4, %%eax                 \n\t"
02150                         "movq "MANGLE(w1111)", %%mm5            \n\t"
02151                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
02152                         "pxor %%mm7, %%mm7              \n\t"
02153                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
02154                         "addl %%ebx, %%ebx              \n\t"
02155                         ".balign 16                     \n\t"
02156                         "1:                             \n\t"
02157                         PREFETCH" 64(%0, %%ebx)         \n\t"
02158                         PREFETCH" 64(%1, %%ebx)         \n\t"
02159 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
02160                         "movq (%0, %%ebx), %%mm0        \n\t"
02161                         "movq (%1, %%ebx), %%mm1        \n\t"
02162                         "movq 6(%0, %%ebx), %%mm2       \n\t"
02163                         "movq 6(%1, %%ebx), %%mm3       \n\t"
02164                         PAVGB" %%mm1, %%mm0             \n\t"
02165                         PAVGB" %%mm3, %%mm2             \n\t"
02166                         "movq %%mm0, %%mm1              \n\t"
02167                         "movq %%mm2, %%mm3              \n\t"
02168                         "psrlq $24, %%mm0               \n\t"
02169                         "psrlq $24, %%mm2               \n\t"
02170                         PAVGB" %%mm1, %%mm0             \n\t"
02171                         PAVGB" %%mm3, %%mm2             \n\t"
02172                         "punpcklbw %%mm7, %%mm0         \n\t"
02173                         "punpcklbw %%mm7, %%mm2         \n\t"
02174 #else
02175                         "movd (%0, %%ebx), %%mm0        \n\t"
02176                         "movd (%1, %%ebx), %%mm1        \n\t"
02177                         "movd 3(%0, %%ebx), %%mm2       \n\t"
02178                         "movd 3(%1, %%ebx), %%mm3       \n\t"
02179                         "punpcklbw %%mm7, %%mm0         \n\t"
02180                         "punpcklbw %%mm7, %%mm1         \n\t"
02181                         "punpcklbw %%mm7, %%mm2         \n\t"
02182                         "punpcklbw %%mm7, %%mm3         \n\t"
02183                         "paddw %%mm1, %%mm0             \n\t"
02184                         "paddw %%mm3, %%mm2             \n\t"
02185                         "paddw %%mm2, %%mm0             \n\t"
02186                         "movd 6(%0, %%ebx), %%mm4       \n\t"
02187                         "movd 6(%1, %%ebx), %%mm1       \n\t"
02188                         "movd 9(%0, %%ebx), %%mm2       \n\t"
02189                         "movd 9(%1, %%ebx), %%mm3       \n\t"
02190                         "punpcklbw %%mm7, %%mm4         \n\t"
02191                         "punpcklbw %%mm7, %%mm1         \n\t"
02192                         "punpcklbw %%mm7, %%mm2         \n\t"
02193                         "punpcklbw %%mm7, %%mm3         \n\t"
02194                         "paddw %%mm1, %%mm4             \n\t"
02195                         "paddw %%mm3, %%mm2             \n\t"
02196                         "paddw %%mm4, %%mm2             \n\t"
02197                         "psrlw $2, %%mm0                \n\t"
02198                         "psrlw $2, %%mm2                \n\t"
02199 #endif
02200                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
02201                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
02202 
02203                         "pmaddwd %%mm0, %%mm1           \n\t"
02204                         "pmaddwd %%mm2, %%mm3           \n\t"
02205                         "pmaddwd %%mm6, %%mm0           \n\t"
02206                         "pmaddwd %%mm6, %%mm2           \n\t"
02207 #ifndef FAST_BGR2YV12
02208                         "psrad $8, %%mm0                \n\t"
02209                         "psrad $8, %%mm1                \n\t"
02210                         "psrad $8, %%mm2                \n\t"
02211                         "psrad $8, %%mm3                \n\t"
02212 #endif
02213                         "packssdw %%mm2, %%mm0          \n\t"
02214                         "packssdw %%mm3, %%mm1          \n\t"
02215                         "pmaddwd %%mm5, %%mm0           \n\t"
02216                         "pmaddwd %%mm5, %%mm1           \n\t"
02217                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
02218                         "psraw $7, %%mm0                \n\t"
02219 
02220 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
02221                         "movq 12(%0, %%ebx), %%mm4      \n\t"
02222                         "movq 12(%1, %%ebx), %%mm1      \n\t"
02223                         "movq 18(%0, %%ebx), %%mm2      \n\t"
02224                         "movq 18(%1, %%ebx), %%mm3      \n\t"
02225                         PAVGB" %%mm1, %%mm4             \n\t"
02226                         PAVGB" %%mm3, %%mm2             \n\t"
02227                         "movq %%mm4, %%mm1              \n\t"
02228                         "movq %%mm2, %%mm3              \n\t"
02229                         "psrlq $24, %%mm4               \n\t"
02230                         "psrlq $24, %%mm2               \n\t"
02231                         PAVGB" %%mm1, %%mm4             \n\t"
02232                         PAVGB" %%mm3, %%mm2             \n\t"
02233                         "punpcklbw %%mm7, %%mm4         \n\t"
02234                         "punpcklbw %%mm7, %%mm2         \n\t"
02235 #else
02236                         "movd 12(%0, %%ebx), %%mm4      \n\t"
02237                         "movd 12(%1, %%ebx), %%mm1      \n\t"
02238                         "movd 15(%0, %%ebx), %%mm2      \n\t"
02239                         "movd 15(%1, %%ebx), %%mm3      \n\t"
02240                         "punpcklbw %%mm7, %%mm4         \n\t"
02241                         "punpcklbw %%mm7, %%mm1         \n\t"
02242                         "punpcklbw %%mm7, %%mm2         \n\t"
02243                         "punpcklbw %%mm7, %%mm3         \n\t"
02244                         "paddw %%mm1, %%mm4             \n\t"
02245                         "paddw %%mm3, %%mm2             \n\t"
02246                         "paddw %%mm2, %%mm4             \n\t"
02247                         "movd 18(%0, %%ebx), %%mm5      \n\t"
02248                         "movd 18(%1, %%ebx), %%mm1      \n\t"
02249                         "movd 21(%0, %%ebx), %%mm2      \n\t"
02250                         "movd 21(%1, %%ebx), %%mm3      \n\t"
02251                         "punpcklbw %%mm7, %%mm5         \n\t"
02252                         "punpcklbw %%mm7, %%mm1         \n\t"
02253                         "punpcklbw %%mm7, %%mm2         \n\t"
02254                         "punpcklbw %%mm7, %%mm3         \n\t"
02255                         "paddw %%mm1, %%mm5             \n\t"
02256                         "paddw %%mm3, %%mm2             \n\t"
02257                         "paddw %%mm5, %%mm2             \n\t"
02258                         "movq "MANGLE(w1111)", %%mm5            \n\t"
02259                         "psrlw $2, %%mm4                \n\t"
02260                         "psrlw $2, %%mm2                \n\t"
02261 #endif
02262                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
02263                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
02264 
02265                         "pmaddwd %%mm4, %%mm1           \n\t"
02266                         "pmaddwd %%mm2, %%mm3           \n\t"
02267                         "pmaddwd %%mm6, %%mm4           \n\t"
02268                         "pmaddwd %%mm6, %%mm2           \n\t"
02269 #ifndef FAST_BGR2YV12
02270                         "psrad $8, %%mm4                \n\t"
02271                         "psrad $8, %%mm1                \n\t"
02272                         "psrad $8, %%mm2                \n\t"
02273                         "psrad $8, %%mm3                \n\t"
02274 #endif
02275                         "packssdw %%mm2, %%mm4          \n\t"
02276                         "packssdw %%mm3, %%mm1          \n\t"
02277                         "pmaddwd %%mm5, %%mm4           \n\t"
02278                         "pmaddwd %%mm5, %%mm1           \n\t"
02279                         "addl $24, %%ebx                \n\t"
02280                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
02281                         "psraw $7, %%mm4                \n\t"
02282 
02283                         "movq %%mm0, %%mm1              \n\t"
02284                         "punpckldq %%mm4, %%mm0         \n\t"
02285                         "punpckhdq %%mm4, %%mm1         \n\t"
02286                         "packsswb %%mm1, %%mm0          \n\t"
02287                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
02288 
02289                         "movd %%mm0, (%2, %%eax)        \n\t"
02290                         "punpckhdq %%mm0, %%mm0         \n\t"
02291                         "movd %%mm0, (%3, %%eax)        \n\t"
02292                         "addl $4, %%eax                 \n\t"
02293                         " js 1b                         \n\t"
02294                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
02295                         : "%eax", "%ebx"
02296                 );
02297 
02298                 udst += chromStride;
02299                 vdst += chromStride;
02300                 src  += srcStride*2;
02301         }
02302 
02303         asm volatile(   EMMS" \n\t"
02304                         SFENCE" \n\t"
02305                         :::"memory");
02306 #else
02307         y=0;
02308 #endif
02309         for(; y<height; y+=2)
02310         {
02311                 unsigned i;
02312                 for(i=0; i<chromWidth; i++)
02313                 {
02314                         unsigned int b= src[6*i+0];
02315                         unsigned int g= src[6*i+1];
02316                         unsigned int r= src[6*i+2];
02317 
02318                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02319                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
02320                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
02321 
02322                         udst[i]         = U;
02323                         vdst[i]         = V;
02324                         ydst[2*i]       = Y;
02325 
02326                         b= src[6*i+3];
02327                         g= src[6*i+4];
02328                         r= src[6*i+5];
02329 
02330                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02331                         ydst[2*i+1]     = Y;
02332                 }
02333                 ydst += lumStride;
02334                 src  += srcStride;
02335 
02336                 for(i=0; i<chromWidth; i++)
02337                 {
02338                         unsigned int b= src[6*i+0];
02339                         unsigned int g= src[6*i+1];
02340                         unsigned int r= src[6*i+2];
02341 
02342                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02343 
02344                         ydst[2*i]       = Y;
02345 
02346                         b= src[6*i+3];
02347                         g= src[6*i+4];
02348                         r= src[6*i+5];
02349 
02350                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
02351                         ydst[2*i+1]     = Y;
02352                 }
02353                 udst += chromStride;
02354                 vdst += chromStride;
02355                 ydst += lumStride;
02356                 src  += srcStride;
02357         }
02358 }
02359 
/**
 * Interleaves two byte planes into a single plane.
 *
 * For each of the `height` rows this writes dest[2*i] = src1[i] and
 * dest[2*i+1] = src2[i] for every i in [0, width), then advances the three
 * pointers by their respective strides.
 *
 * With HAVE_MMX the bulk of a row is handled 16 source bytes per iteration
 * in inline assembly (SSE2 when HAVE_SSE2 is set and the row pointers are
 * 16 byte aligned, MMX otherwise); the remaining width%16 bytes are done by
 * the C loop. Without HAVE_MMX the whole row is done by the C loop.
 *
 * @param src1       first input plane (supplies the even output bytes)
 * @param src2       second input plane (supplies the odd output bytes)
 * @param dest       output plane, 2*width bytes are written per row
 * @param width      number of bytes taken from each input per row
 * @param height     number of rows
 * @param src1Stride value added to src1 after each row
 * @param src2Stride value added to src2 after each row
 * @param dstStride  value added to dest after each row
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                            unsigned width, unsigned height, int src1Stride,
                            int src2Stride, int dstStride){
        unsigned h;

        for(h=0; h < height; h++)
        {
                unsigned w;

#ifdef HAVE_MMX
                /*
                 * Both asm loops execute their body once before the counter is
                 * tested, and the limit (width-15) is unsigned, so it wraps to
                 * a huge value for width < 15. Only enter them when the row
                 * holds at least one complete 16 byte chunk; shorter rows are
                 * handled entirely by the C loop below.
                 */
                if(width >= 16)
                {
#ifdef HAVE_SSE2
                        /* movdqa and movntdq fault on addresses that are not
                           16 byte aligned; use the MMX loop for such rows */
                        if(!(((uintptr_t)src1 | (uintptr_t)src2 | (uintptr_t)dest) & 15))
                        {
                                asm(
                                        "xorl %%eax, %%eax              \n\t"
                                        "1:                             \n\t"
                                        PREFETCH" 64(%1, %%eax)         \n\t"
                                        PREFETCH" 64(%2, %%eax)         \n\t"
                                        "movdqa (%1, %%eax), %%xmm0     \n\t"
                                        "movdqa (%1, %%eax), %%xmm1     \n\t"
                                        "movdqa (%2, %%eax), %%xmm2     \n\t"
                                        "punpcklbw %%xmm2, %%xmm0       \n\t"
                                        "punpckhbw %%xmm2, %%xmm1       \n\t"
                                        "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
                                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
                                        "addl $16, %%eax                        \n\t"
                                        "cmpl %3, %%eax                 \n\t"
                                        " jb 1b                         \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%eax"
                                );
                        }
                        else
#endif
                        {
                                asm(
                                        "xorl %%eax, %%eax              \n\t"
                                        "1:                             \n\t"
                                        PREFETCH" 64(%1, %%eax)         \n\t"
                                        PREFETCH" 64(%2, %%eax)         \n\t"
                                        "movq (%1, %%eax), %%mm0        \n\t"
                                        "movq 8(%1, %%eax), %%mm2       \n\t"
                                        "movq %%mm0, %%mm1              \n\t"
                                        "movq %%mm2, %%mm3              \n\t"
                                        "movq (%2, %%eax), %%mm4        \n\t"
                                        "movq 8(%2, %%eax), %%mm5       \n\t"
                                        "punpcklbw %%mm4, %%mm0         \n\t"
                                        "punpckhbw %%mm4, %%mm1         \n\t"
                                        "punpcklbw %%mm5, %%mm2         \n\t"
                                        "punpckhbw %%mm5, %%mm3         \n\t"
                                        MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
                                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
                                        MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
                                        MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
                                        "addl $16, %%eax                        \n\t"
                                        "cmpl %3, %%eax                 \n\t"
                                        " jb 1b                         \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%eax"
                                );
                        }
                }
                /* tail: the bytes past the last complete 16 byte chunk */
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
02440 
/**
 * Upscales two byte planes by a factor of two in both directions.
 *
 * For each plane, output row y (0 <= y < height/2) is built from input row
 * y>>1, and every input byte x (0 <= x < width/2) is written twice, to
 * output positions 2*x and 2*x+1. src1 is expanded into dst1 and src2 into
 * dst2.
 *
 * With HAVE_MMX, 32 input bytes are expanded per asm iteration and the rest
 * of the row is finished by the C loop.
 *
 * @param src1       first input plane
 * @param src2       second input plane
 * @param dst1       output plane for src1
 * @param dst2       output plane for src2
 * @param width      twice the number of input bytes consumed per row
 * @param height     twice the number of output rows produced
 * @param srcStride1 offset between consecutive rows of src1
 * @param srcStride2 offset between consecutive rows of src2
 * @param dstStride1 offset between consecutive rows of dst1
 * @param dstStride2 offset between consecutive rows of dst2
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int dstStride1, int dstStride2)
{
    unsigned int y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0;y<h;y++){
        /* row offsets are computed in ptrdiff_t: int*unsigned would be
           evaluated as unsigned and break negative strides on 64 bit */
        const uint8_t* s1=src1+(ptrdiff_t)srcStride1*(ptrdiff_t)(y>>1);
        uint8_t* d=dst1+(ptrdiff_t)dstStride1*(ptrdiff_t)y;
        x=0;
#ifdef HAVE_MMX
        /* x+31<w instead of x<w-31: the subtraction wraps for w < 31 once
           it is compared against the unsigned x */
        for(;x+31<w;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   16%1, %%mm4\n\t"
                "movq   24%1, %%mm6\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "movq   %%mm4, %%mm5\n\t"
                "movq   %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        /* remaining bytes of the row (the whole row without HAVE_MMX) */
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+(ptrdiff_t)srcStride2*(ptrdiff_t)(y>>1);
        uint8_t* d=dst2+(ptrdiff_t)dstStride2*(ptrdiff_t)y;
        x=0;
#ifdef HAVE_MMX
        for(;x+31<w;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   16%1, %%mm4\n\t"
                "movq   24%1, %%mm6\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "movq   %%mm4, %%mm5\n\t"
                "movq   %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
02544 
/**
 * Packs a luma plane and two chroma planes into one interleaved plane.
 *
 * For every row y in [0, height) and every x in [0, width/2) eight output
 * bytes are produced at dst row y, offset 8*x:
 *   Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * where Y comes from src1 row y, and U / V come from src2 / src3 row y>>2.
 *
 * With HAVE_MMX, eight x positions are handled per asm iteration and the
 * rest of the row is finished by the C loop.
 *
 * NOTE(review): with w = width/2 a row consumes 2*width bytes of src1 and
 * emits 4*width bytes of dst - confirm against the callers which unit
 * `width` is expressed in.
 *
 * @param src1       luma plane
 * @param src2       plane supplying the byte at output offsets 1 and 5 (U)
 * @param src3       plane supplying the byte at output offsets 3 and 7 (V)
 * @param dst        packed output plane
 * @param width      twice the number of 8 byte output groups per row
 * @param height     number of output rows
 * @param srcStride1 offset between consecutive rows of src1
 * @param srcStride2 offset between consecutive rows of src2
 * @param srcStride3 offset between consecutive rows of src3
 * @param dstStride  offset between consecutive rows of dst
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int srcStride3, int dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        /* row offsets are computed in ptrdiff_t: int*unsigned would be
           evaluated as unsigned and break negative strides on 64 bit */
        const uint8_t* yp=src1+(ptrdiff_t)srcStride1*(ptrdiff_t)y;
        const uint8_t* up=src2+(ptrdiff_t)srcStride2*(ptrdiff_t)(y>>2);
        const uint8_t* vp=src3+(ptrdiff_t)srcStride3*(ptrdiff_t)(y>>2);
        uint8_t* d=dst+(ptrdiff_t)dstStride*(ptrdiff_t)y;
        x=0;
#ifdef HAVE_MMX
        /* x+7<w instead of x<w-7: the unsigned subtraction wraps for w < 7 */
        for(;x+7<w;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
                "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq   %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq   8(%1, %0, 4), %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   16(%1, %0, 4), %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq   24(%1, %0, 4), %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* remaining groups of the row (the whole row without HAVE_MMX) */
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}

Generated on Tue Dec 20 10:14:54 2005 for vlc-0.8.4a by  doxygen 1.4.2