
swscale_template.c

00001 /*
00002     Copyright (C) 2001-2003 Michael Niedermayer <[email protected]>
00003 
00004     This program is free software; you can redistribute it and/or modify
00005     it under the terms of the GNU General Public License as published by
00006     the Free Software Foundation; either version 2 of the License, or
00007     (at your option) any later version.
00008 
00009     This program is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012     GNU General Public License for more details.
00013 
00014     You should have received a copy of the GNU General Public License
00015     along with this program; if not, write to the Free Software
00016     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00017 */
00018 
00019 #undef MOVNTQ
00020 #undef PAVGB
00021 #undef PREFETCH
00022 #undef PREFETCHW
00023 #undef EMMS
00024 #undef SFENCE
00025 
00026 #ifdef HAVE_3DNOW
00027 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
00028 #define EMMS     "femms"
00029 #else
00030 #define EMMS     "emms"
00031 #endif
00032 
00033 #ifdef HAVE_3DNOW
00034 #define PREFETCH  "prefetch"
00035 #define PREFETCHW "prefetchw"
00036 #elif defined ( HAVE_MMX2 )
00037 #define PREFETCH "prefetchnta"
00038 #define PREFETCHW "prefetcht0"
00039 #else
00040 #define PREFETCH "/nop"
00041 #define PREFETCHW "/nop"
00042 #endif
00043 
00044 #ifdef HAVE_MMX2
00045 #define SFENCE "sfence"
00046 #else
00047 #define SFENCE "/nop"
00048 #endif
00049 
00050 #ifdef HAVE_MMX2
00051 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00052 #elif defined (HAVE_3DNOW)
00053 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00054 #endif
00055 
00056 #ifdef HAVE_MMX2
00057 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00058 #else
00059 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00060 #endif
00061 
00062 #ifdef HAVE_ALTIVEC
00063 #include "swscale_altivec_template.c"
00064 #endif
00065 
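/* Illustrative sketch, not part of the original file: how the EMMS/MOVNTQ/
   SFENCE wrappers above are typically combined in this file's inline asm.
   copy8_nt() is a hypothetical helper; with HAVE_MMX2 the store becomes a
   cache-bypassing movntq ordered by sfence, without it a plain movq and a
   "/nop" placeholder. */
static inline void copy8_nt(uint8_t *dst, const uint8_t *src)
{
        asm volatile(
                "movq (%1), %%mm0               \n\t"   // load 8 bytes into an MMX register
                MOVNTQ(%%mm0, (%0))                     // streaming (or plain) 8 byte store
                SFENCE"                         \n\t"   // order non-temporal stores
                EMMS"                           \n\t"   // leave MMX state so the FPU is usable again
                :: "r" (dst), "r" (src)
                : "memory");
}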
00066 #define YSCALEYUV2YV12X(x, offset) \
00067                         "xorl %%eax, %%eax              \n\t"\
00068                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
00069                         "movq %%mm3, %%mm4              \n\t"\
00070                         "leal " offset "(%0), %%edx     \n\t"\
00071                         "movl (%%edx), %%esi            \n\t"\
00072                         ".balign 16                     \n\t" /* FIXME Unroll? */\
00073                         "1:                             \n\t"\
00074                         "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
00075                         "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
00076                         "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
00077                         "addl $16, %%edx                \n\t"\
00078                         "movl (%%edx), %%esi            \n\t"\
00079                         "testl %%esi, %%esi             \n\t"\
00080                         "pmulhw %%mm0, %%mm2            \n\t"\
00081                         "pmulhw %%mm0, %%mm5            \n\t"\
00082                         "paddw %%mm2, %%mm3             \n\t"\
00083                         "paddw %%mm5, %%mm4             \n\t"\
00084                         " jnz 1b                        \n\t"\
00085                         "psraw $3, %%mm3                \n\t"\
00086                         "psraw $3, %%mm4                \n\t"\
00087                         "packuswb %%mm4, %%mm3          \n\t"\
00088                         MOVNTQ(%%mm3, (%1, %%eax))\
00089                         "addl $8, %%eax                 \n\t"\
00090                         "cmpl %2, %%eax                 \n\t"\
00091                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
00092                         "movq %%mm3, %%mm4              \n\t"\
00093                         "leal " offset "(%0), %%edx     \n\t"\
00094                         "movl (%%edx), %%esi            \n\t"\
00095                         "jb 1b                          \n\t"
00096 
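/* Illustrative C equivalent of one output sample of YSCALEYUV2YV12X above
   (a sketch, not part of the original file; the helper name is hypothetical).
   The MMX loop walks the (pointer, coefficient) entries at 'offset', doing
   pmulhw (>>16) and paddw per tap, then psraw $3 and packuswb - roughly a
   rounded, saturated (sum of src*coeff) >> 19. */
static inline uint8_t yscaleyuv2yv12x_ref(int16_t **src, int16_t *filter,
                                          int filterSize, int i)
{
        int j, val= 1<<18;                      // rounding term, analogous to VROUNDER_OFFSET
        for(j=0; j<filterSize; j++)
                val+= src[j][i] * filter[j];    // pmulhw + paddw in the MMX code
        val>>= 19;                              // >>16 from pmulhw plus the psraw $3
        if(val < 0)   val= 0;                   // packuswb saturates to 0..255
        if(val > 255) val= 255;
        return val;
}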
00097 #define YSCALEYUV2YV121 \
00098                         "movl %2, %%eax                 \n\t"\
00099                         ".balign 16                     \n\t" /* FIXME Unroll? */\
00100                         "1:                             \n\t"\
00101                         "movq (%0, %%eax, 2), %%mm0     \n\t"\
00102                         "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
00103                         "psraw $7, %%mm0                \n\t"\
00104                         "psraw $7, %%mm1                \n\t"\
00105                         "packuswb %%mm1, %%mm0          \n\t"\
00106                         MOVNTQ(%%mm0, (%1, %%eax))\
00107                         "addl $8, %%eax                 \n\t"\
00108                         "jnc 1b                         \n\t"
00109 
00110 /*
00111                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
00112                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
00113                            "r" (dest), "m" (dstW),
00114                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
00115                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
00116 */
00117 #define YSCALEYUV2PACKEDX \
00118                 "xorl %%eax, %%eax              \n\t"\
00119                 ".balign 16                     \n\t"\
00120                 "nop                            \n\t"\
00121                 "1:                             \n\t"\
00122                 "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx        \n\t"\
00123                 "movl (%%edx), %%esi            \n\t"\
00124                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
00125                 "movq %%mm3, %%mm4              \n\t"\
00126                 ".balign 16                     \n\t"\
00127                 "2:                             \n\t"\
00128                 "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
00129                 "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
00130                 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
00131                 "addl $16, %%edx                \n\t"\
00132                 "movl (%%edx), %%esi            \n\t"\
00133                 "pmulhw %%mm0, %%mm2            \n\t"\
00134                 "pmulhw %%mm0, %%mm5            \n\t"\
00135                 "paddw %%mm2, %%mm3             \n\t"\
00136                 "paddw %%mm5, %%mm4             \n\t"\
00137                 "testl %%esi, %%esi             \n\t"\
00138                 " jnz 2b                        \n\t"\
00139 \
00140                 "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx        \n\t"\
00141                 "movl (%%edx), %%esi            \n\t"\
00142                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
00143                 "movq %%mm1, %%mm7              \n\t"\
00144                 ".balign 16                     \n\t"\
00145                 "2:                             \n\t"\
00146                 "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
00147                 "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
00148                 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
00149                 "addl $16, %%edx                \n\t"\
00150                 "movl (%%edx), %%esi            \n\t"\
00151                 "pmulhw %%mm0, %%mm2            \n\t"\
00152                 "pmulhw %%mm0, %%mm5            \n\t"\
00153                 "paddw %%mm2, %%mm1             \n\t"\
00154                 "paddw %%mm5, %%mm7             \n\t"\
00155                 "testl %%esi, %%esi             \n\t"\
00156                 " jnz 2b                        \n\t"\
00157 
00158 
00159 #define YSCALEYUV2RGBX \
00160                 YSCALEYUV2PACKEDX\
00161                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
00162                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
00163                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
00164                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
00165                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
00166                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
00167         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00168                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
00169                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
00170                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
00171                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
00172                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
00173                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
00174         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00175                 "paddw %%mm3, %%mm4             \n\t"\
00176                 "movq %%mm2, %%mm0              \n\t"\
00177                 "movq %%mm5, %%mm6              \n\t"\
00178                 "movq %%mm4, %%mm3              \n\t"\
00179                 "punpcklwd %%mm2, %%mm2         \n\t"\
00180                 "punpcklwd %%mm5, %%mm5         \n\t"\
00181                 "punpcklwd %%mm4, %%mm4         \n\t"\
00182                 "paddw %%mm1, %%mm2             \n\t"\
00183                 "paddw %%mm1, %%mm5             \n\t"\
00184                 "paddw %%mm1, %%mm4             \n\t"\
00185                 "punpckhwd %%mm0, %%mm0         \n\t"\
00186                 "punpckhwd %%mm6, %%mm6         \n\t"\
00187                 "punpckhwd %%mm3, %%mm3         \n\t"\
00188                 "paddw %%mm7, %%mm0             \n\t"\
00189                 "paddw %%mm7, %%mm6             \n\t"\
00190                 "paddw %%mm7, %%mm3             \n\t"\
00191                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00192                 "packuswb %%mm0, %%mm2          \n\t"\
00193                 "packuswb %%mm6, %%mm5          \n\t"\
00194                 "packuswb %%mm3, %%mm4          \n\t"\
00195                 "pxor %%mm7, %%mm7              \n\t"
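/* Rough scalar equivalent of the colour math YSCALEYUV2RGBX performs after
   the vertical sums (an illustrative sketch, not part of the original file).
   The *_OFFSET entries remove the Y/U/V bias and the *_COEFF entries are
   fixed-point forms of the usual ITU-R BT.601 factors; the approximate
   floating-point values below merely stand in for them, and the real code
   works on higher-precision intermediates before packuswb saturates each
   channel to 0..255. */
static inline void yuv2rgb_ref(int Y, int U, int V, int *r, int *g, int *b)
{
        double y= 1.164*(Y - 16);               // Y_OFFSET, Y_COEFF
        double u= U - 128;                      // U_OFFSET
        double v= V - 128;                      // V_OFFSET
        *b= (int)(y + 2.017*u);                 // UB_COEFF
        *g= (int)(y - 0.392*u - 0.813*v);       // UG_COEFF, VG_COEFF
        *r= (int)(y + 1.596*v);                 // VR_COEFF
}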
00196 #if 0
00197 #define FULL_YSCALEYUV2RGB \
00198                 "pxor %%mm7, %%mm7              \n\t"\
00199                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
00200                 "punpcklwd %%mm6, %%mm6         \n\t"\
00201                 "punpcklwd %%mm6, %%mm6         \n\t"\
00202                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
00203                 "punpcklwd %%mm5, %%mm5         \n\t"\
00204                 "punpcklwd %%mm5, %%mm5         \n\t"\
00205                 "xorl %%eax, %%eax              \n\t"\
00206                 ".balign 16                     \n\t"\
00207                 "1:                             \n\t"\
00208                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
00209                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
00210                 "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
00211                 "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
00212                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
00213                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00214                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00215                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00216                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00217                 "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
00218                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00219                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00220                 "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
00221                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
00222                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00223                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
00224                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
00225                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
00226 \
00227 \
00228                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00229                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
00230                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
00231                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00232                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
00233                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
00234                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
00235 \
00236 \
00237                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
00238                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
00239                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
00240                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
00241                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
00242                 "packuswb %%mm3, %%mm3          \n\t"\
00243 \
00244                 "packuswb %%mm0, %%mm0          \n\t"\
00245                 "paddw %%mm4, %%mm2             \n\t"\
00246                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
00247 \
00248                 "packuswb %%mm1, %%mm1          \n\t"
00249 #endif
00250 
00251 #define YSCALEYUV2PACKED(index, c) \
00252                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
00253                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
00254                 "psraw $3, %%mm0                \n\t"\
00255                 "psraw $3, %%mm1                \n\t"\
00256                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
00257                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
00258                 "xorl "#index", "#index"                \n\t"\
00259                 ".balign 16                     \n\t"\
00260                 "1:                             \n\t"\
00261                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00262                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00263                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
00264                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
00265                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00266                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00267                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
00268                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00269                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00270                 "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
00271                 "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
00272                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
00273                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
00274                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
00275                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
00276                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
00277                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
00278                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
00279                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
00280                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00281                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00282                 "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
00283                 "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
00284                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00285                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00286                 
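/* Plain C view of one sample of the YSCALEYUV2PACKED loop body (illustrative
   sketch, not part of the original file).  Two vertically adjacent source
   lines are blended: the line difference is weighted with the pre-shifted
   coefficient stored at LUM_MMX_FILTER_OFFSET+8 (resp. CHR_MMX_FILTER_OFFSET+8)
   and added to the second line, i.e. a linear interpolation scaled down to
   packed-pixel range. */
static inline int yscale_vblend_ref(int16_t buf0_i, int16_t buf1_i, int16_t coeff)
{
        // psubw, pmulhw, psraw $7, paddw of the macro, for one int16 sample
        return (buf1_i>>7) + (((buf0_i - buf1_i) * coeff) >> 16);
}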
00287 #define YSCALEYUV2RGB(index, c) \
00288                 "xorl "#index", "#index"        \n\t"\
00289                 ".balign 16                     \n\t"\
00290                 "1:                             \n\t"\
00291                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00292                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00293                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
00294                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
00295                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00296                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00297                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
00298                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00299                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00300                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00301                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00302                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
00303                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
00304                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
00305                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
00306                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
00307                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
00308                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
00309                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
00310         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00311                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
00312                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
00313                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
00314                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
00315                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
00316                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
00317                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00318                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00319                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00320                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00321                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00322                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00323                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
00324                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
00325                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
00326                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
00327                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
00328                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
00329         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00330                 "paddw %%mm3, %%mm4             \n\t"\
00331                 "movq %%mm2, %%mm0              \n\t"\
00332                 "movq %%mm5, %%mm6              \n\t"\
00333                 "movq %%mm4, %%mm3              \n\t"\
00334                 "punpcklwd %%mm2, %%mm2         \n\t"\
00335                 "punpcklwd %%mm5, %%mm5         \n\t"\
00336                 "punpcklwd %%mm4, %%mm4         \n\t"\
00337                 "paddw %%mm1, %%mm2             \n\t"\
00338                 "paddw %%mm1, %%mm5             \n\t"\
00339                 "paddw %%mm1, %%mm4             \n\t"\
00340                 "punpckhwd %%mm0, %%mm0         \n\t"\
00341                 "punpckhwd %%mm6, %%mm6         \n\t"\
00342                 "punpckhwd %%mm3, %%mm3         \n\t"\
00343                 "paddw %%mm7, %%mm0             \n\t"\
00344                 "paddw %%mm7, %%mm6             \n\t"\
00345                 "paddw %%mm7, %%mm3             \n\t"\
00346                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00347                 "packuswb %%mm0, %%mm2          \n\t"\
00348                 "packuswb %%mm6, %%mm5          \n\t"\
00349                 "packuswb %%mm3, %%mm4          \n\t"\
00350                 "pxor %%mm7, %%mm7              \n\t"
00351                 
00352 #define YSCALEYUV2PACKED1(index, c) \
00353                 "xorl "#index", "#index"                \n\t"\
00354                 ".balign 16                     \n\t"\
00355                 "1:                             \n\t"\
00356                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00357                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
00358                 "psraw $7, %%mm3                \n\t" \
00359                 "psraw $7, %%mm4                \n\t" \
00360                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
00361                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
00362                 "psraw $7, %%mm1                \n\t" \
00363                 "psraw $7, %%mm7                \n\t" \
00364                 
00365 #define YSCALEYUV2RGB1(index, c) \
00366                 "xorl "#index", "#index"        \n\t"\
00367                 ".balign 16                     \n\t"\
00368                 "1:                             \n\t"\
00369                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00370                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
00371                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00372                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00373                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
00374                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
00375                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
00376                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
00377                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
00378                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
00379         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00380                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
00381                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
00382                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00383                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00384                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
00385                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
00386                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
00387                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
00388                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
00389                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
00390         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00391                 "paddw %%mm3, %%mm4             \n\t"\
00392                 "movq %%mm2, %%mm0              \n\t"\
00393                 "movq %%mm5, %%mm6              \n\t"\
00394                 "movq %%mm4, %%mm3              \n\t"\
00395                 "punpcklwd %%mm2, %%mm2         \n\t"\
00396                 "punpcklwd %%mm5, %%mm5         \n\t"\
00397                 "punpcklwd %%mm4, %%mm4         \n\t"\
00398                 "paddw %%mm1, %%mm2             \n\t"\
00399                 "paddw %%mm1, %%mm5             \n\t"\
00400                 "paddw %%mm1, %%mm4             \n\t"\
00401                 "punpckhwd %%mm0, %%mm0         \n\t"\
00402                 "punpckhwd %%mm6, %%mm6         \n\t"\
00403                 "punpckhwd %%mm3, %%mm3         \n\t"\
00404                 "paddw %%mm7, %%mm0             \n\t"\
00405                 "paddw %%mm7, %%mm6             \n\t"\
00406                 "paddw %%mm7, %%mm3             \n\t"\
00407                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00408                 "packuswb %%mm0, %%mm2          \n\t"\
00409                 "packuswb %%mm6, %%mm5          \n\t"\
00410                 "packuswb %%mm3, %%mm4          \n\t"\
00411                 "pxor %%mm7, %%mm7              \n\t"
00412 
00413 #define YSCALEYUV2PACKED1b(index, c) \
00414                 "xorl "#index", "#index"                \n\t"\
00415                 ".balign 16                     \n\t"\
00416                 "1:                             \n\t"\
00417                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00418                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00419                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
00420                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
00421                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00422                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00423                 "psrlw $8, %%mm3                \n\t" \
00424                 "psrlw $8, %%mm4                \n\t" \
00425                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
00426                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
00427                 "psraw $7, %%mm1                \n\t" \
00428                 "psraw $7, %%mm7                \n\t" 
00429                 
00430 // do vertical chrominance interpolation
00431 #define YSCALEYUV2RGB1b(index, c) \
00432                 "xorl "#index", "#index"                \n\t"\
00433                 ".balign 16                     \n\t"\
00434                 "1:                             \n\t"\
00435                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00436                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00437                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
00438                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
00439                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00440                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00441                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
00442                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
00443                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
00444                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
00445                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
00446                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
00447                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
00448                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
00449         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00450                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
00451                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
00452                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00453                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00454                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
00455                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
00456                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
00457                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
00458                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
00459                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
00460         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00461                 "paddw %%mm3, %%mm4             \n\t"\
00462                 "movq %%mm2, %%mm0              \n\t"\
00463                 "movq %%mm5, %%mm6              \n\t"\
00464                 "movq %%mm4, %%mm3              \n\t"\
00465                 "punpcklwd %%mm2, %%mm2         \n\t"\
00466                 "punpcklwd %%mm5, %%mm5         \n\t"\
00467                 "punpcklwd %%mm4, %%mm4         \n\t"\
00468                 "paddw %%mm1, %%mm2             \n\t"\
00469                 "paddw %%mm1, %%mm5             \n\t"\
00470                 "paddw %%mm1, %%mm4             \n\t"\
00471                 "punpckhwd %%mm0, %%mm0         \n\t"\
00472                 "punpckhwd %%mm6, %%mm6         \n\t"\
00473                 "punpckhwd %%mm3, %%mm3         \n\t"\
00474                 "paddw %%mm7, %%mm0             \n\t"\
00475                 "paddw %%mm7, %%mm6             \n\t"\
00476                 "paddw %%mm7, %%mm3             \n\t"\
00477                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00478                 "packuswb %%mm0, %%mm2          \n\t"\
00479                 "packuswb %%mm6, %%mm5          \n\t"\
00480                 "packuswb %%mm3, %%mm4          \n\t"\
00481                 "pxor %%mm7, %%mm7              \n\t"
00482 
00483 #define WRITEBGR32(dst, dstw, index) \
00484                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00485                         "movq %%mm2, %%mm1              \n\t" /* B */\
00486                         "movq %%mm5, %%mm6              \n\t" /* R */\
00487                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
00488                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
00489                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
00490                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
00491                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
00492                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
00493                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
00494                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
00495                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
00496                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
00497 \
00498                         MOVNTQ(%%mm0, (dst, index, 4))\
00499                         MOVNTQ(%%mm2, 8(dst, index, 4))\
00500                         MOVNTQ(%%mm1, 16(dst, index, 4))\
00501                         MOVNTQ(%%mm3, 24(dst, index, 4))\
00502 \
00503                         "addl $8, "#index"              \n\t"\
00504                         "cmpl "#dstw", "#index"         \n\t"\
00505                         " jb 1b                         \n\t"
00506 
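/* Scalar sketch of the byte layout WRITEBGR32 produces (illustrative, not
   part of the original file): the punpck sequence interleaves the packed
   B, G and R bytes with a zero byte into little-endian B,G,R,0 quadruples. */
static inline void writebgr32_ref(uint8_t *dst, int i, uint8_t B, uint8_t G, uint8_t R)
{
        dst[4*i+0]= B;
        dst[4*i+1]= G;
        dst[4*i+2]= R;
        dst[4*i+3]= 0;          // unused alpha byte, written as zero (mm7 is cleared above)
}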
00507 #define WRITEBGR16(dst, dstw, index) \
00508                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
00509                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
00510                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
00511                         "psrlq $3, %%mm2                \n\t"\
00512 \
00513                         "movq %%mm2, %%mm1              \n\t"\
00514                         "movq %%mm4, %%mm3              \n\t"\
00515 \
00516                         "punpcklbw %%mm7, %%mm3         \n\t"\
00517                         "punpcklbw %%mm5, %%mm2         \n\t"\
00518                         "punpckhbw %%mm7, %%mm4         \n\t"\
00519                         "punpckhbw %%mm5, %%mm1         \n\t"\
00520 \
00521                         "psllq $3, %%mm3                \n\t"\
00522                         "psllq $3, %%mm4                \n\t"\
00523 \
00524                         "por %%mm3, %%mm2               \n\t"\
00525                         "por %%mm4, %%mm1               \n\t"\
00526 \
00527                         MOVNTQ(%%mm2, (dst, index, 2))\
00528                         MOVNTQ(%%mm1, 8(dst, index, 2))\
00529 \
00530                         "addl $8, "#index"              \n\t"\
00531                         "cmpl "#dstw", "#index"         \n\t"\
00532                         " jb 1b                         \n\t"
00533 
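/* Scalar sketch of the 5-6-5 packing WRITEBGR16 performs (illustrative, not
   part of the original file); WRITEBGR15 is the same idea with 5-5-5 and an
   unused top bit.  The bF8/bFC masks drop the bits that do not fit, the
   shifts and por assemble the 16 bit little-endian word. */
static inline uint16_t writebgr16_ref(uint8_t R, uint8_t G, uint8_t B)
{
        return (uint16_t)(((R>>3)<<11) | ((G>>2)<<5) | (B>>3));
}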
00534 #define WRITEBGR15(dst, dstw, index) \
00535                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
00536                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
00537                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
00538                         "psrlq $3, %%mm2                \n\t"\
00539                         "psrlq $1, %%mm5                \n\t"\
00540 \
00541                         "movq %%mm2, %%mm1              \n\t"\
00542                         "movq %%mm4, %%mm3              \n\t"\
00543 \
00544                         "punpcklbw %%mm7, %%mm3         \n\t"\
00545                         "punpcklbw %%mm5, %%mm2         \n\t"\
00546                         "punpckhbw %%mm7, %%mm4         \n\t"\
00547                         "punpckhbw %%mm5, %%mm1         \n\t"\
00548 \
00549                         "psllq $2, %%mm3                \n\t"\
00550                         "psllq $2, %%mm4                \n\t"\
00551 \
00552                         "por %%mm3, %%mm2               \n\t"\
00553                         "por %%mm4, %%mm1               \n\t"\
00554 \
00555                         MOVNTQ(%%mm2, (dst, index, 2))\
00556                         MOVNTQ(%%mm1, 8(dst, index, 2))\
00557 \
00558                         "addl $8, "#index"              \n\t"\
00559                         "cmpl "#dstw", "#index"         \n\t"\
00560                         " jb 1b                         \n\t"
00561 
00562 #define WRITEBGR24OLD(dst, dstw, index) \
00563                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00564                         "movq %%mm2, %%mm1              \n\t" /* B */\
00565                         "movq %%mm5, %%mm6              \n\t" /* R */\
00566                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
00567                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
00568                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
00569                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
00570                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
00571                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
00572                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
00573                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
00574                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
00575                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
00576 \
00577                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
00578                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
00579                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
00580                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
00581                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
00582                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
00583                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
00584                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
00585 \
00586                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
00587                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
00588                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
00589                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
00590                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
00591                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
00592                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
00593                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
00594                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
00595                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
00596                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
00597                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
00598                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
00599 \
00600                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
00601                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
00602                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
00603                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
00604                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
00605                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
00606                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
00607                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
00608 \
00609                         MOVNTQ(%%mm0, (dst))\
00610                         MOVNTQ(%%mm2, 8(dst))\
00611                         MOVNTQ(%%mm3, 16(dst))\
00612                         "addl $24, "#dst"               \n\t"\
00613 \
00614                         "addl $8, "#index"              \n\t"\
00615                         "cmpl "#dstw", "#index"         \n\t"\
00616                         " jb 1b                         \n\t"
00617 
00618 #define WRITEBGR24MMX(dst, dstw, index) \
00619                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00620                         "movq %%mm2, %%mm1              \n\t" /* B */\
00621                         "movq %%mm5, %%mm6              \n\t" /* R */\
00622                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
00623                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
00624                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
00625                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
00626                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
00627                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
00628                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
00629                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
00630                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
00631                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
00632 \
00633                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
00634                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
00635                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
00636                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
00637 \
00638                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
00639                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
00640                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
00641                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
00642 \
00643                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
00644                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
00645                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
00646                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
00647 \
00648                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
00649                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
00650                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
00651                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
00652                         MOVNTQ(%%mm0, (dst))\
00653 \
00654                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
00655                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
00656                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
00657                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
00658                         MOVNTQ(%%mm6, 8(dst))\
00659 \
00660                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
00661                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
00662                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
00663                         MOVNTQ(%%mm5, 16(dst))\
00664 \
00665                         "addl $24, "#dst"               \n\t"\
00666 \
00667                         "addl $8, "#index"                      \n\t"\
00668                         "cmpl "#dstw", "#index"                 \n\t"\
00669                         " jb 1b                         \n\t"
00670 
00671 #define WRITEBGR24MMX2(dst, dstw, index) \
00672                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00673                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
00674                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
00675                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
00676                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
00677                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
00678 \
00679                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
00680                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
00681                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
00682 \
00683                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
00684                         "por %%mm1, %%mm6               \n\t"\
00685                         "por %%mm3, %%mm6               \n\t"\
00686                         MOVNTQ(%%mm6, (dst))\
00687 \
00688                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
00689                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
00690                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
00691                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
00692 \
00693                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
00694                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
00695                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
00696 \
00697                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
00698                         "por %%mm3, %%mm6               \n\t"\
00699                         MOVNTQ(%%mm6, 8(dst))\
00700 \
00701                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
00702                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
00703                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
00704 \
00705                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
00706                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
00707                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
00708 \
00709                         "por %%mm1, %%mm3               \n\t"\
00710                         "por %%mm3, %%mm6               \n\t"\
00711                         MOVNTQ(%%mm6, 16(dst))\
00712 \
00713                         "addl $24, "#dst"               \n\t"\
00714 \
00715                         "addl $8, "#index"              \n\t"\
00716                         "cmpl "#dstw", "#index"         \n\t"\
00717                         " jb 1b                         \n\t"
00718 
00719 #ifdef HAVE_MMX2
00720 #undef WRITEBGR24
00721 #define WRITEBGR24 WRITEBGR24MMX2
00722 #else
00723 #undef WRITEBGR24
00724 #define WRITEBGR24 WRITEBGR24MMX
00725 #endif
00726 
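/* Scalar sketch of the 24 bit store both WRITEBGR24 variants implement
   (illustrative, not part of the original file): three bytes per pixel in
   B, G, R order with no padding, which is why the asm has to repack four
   0RGB dwords into three qwords before storing. */
static inline void writebgr24_ref(uint8_t *dst, int i, uint8_t B, uint8_t G, uint8_t R)
{
        dst[3*i+0]= B;
        dst[3*i+1]= G;
        dst[3*i+2]= R;
}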
00727 #define WRITEYUY2(dst, dstw, index) \
00728                         "packuswb %%mm3, %%mm3          \n\t"\
00729                         "packuswb %%mm4, %%mm4          \n\t"\
00730                         "packuswb %%mm7, %%mm1          \n\t"\
00731                         "punpcklbw %%mm4, %%mm3         \n\t"\
00732                         "movq %%mm1, %%mm7              \n\t"\
00733                         "punpcklbw %%mm3, %%mm1         \n\t"\
00734                         "punpckhbw %%mm3, %%mm7         \n\t"\
00735 \
00736                         MOVNTQ(%%mm1, (dst, index, 2))\
00737                         MOVNTQ(%%mm7, 8(dst, index, 2))\
00738 \
00739                         "addl $8, "#index"              \n\t"\
00740                         "cmpl "#dstw", "#index"         \n\t"\
00741                         " jb 1b                         \n\t"
00742 
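/* Scalar sketch of the interleaving WRITEYUY2 performs (illustrative, not
   part of the original file): each pair of luma samples shares one U and one
   V sample, stored as Y0 U Y1 V per 4 byte group. */
static inline void writeyuy2_ref(uint8_t *dst, int i, uint8_t Y0, uint8_t U, uint8_t Y1, uint8_t V)
{
        dst[4*i+0]= Y0;
        dst[4*i+1]= U;
        dst[4*i+2]= Y1;
        dst[4*i+3]= V;
}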
00743 
00744 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00745                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00746                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
00747 {
00748 #ifdef HAVE_MMX
00749         if(uDest != NULL)
00750         {
00751                 asm volatile(
00752                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
00753                                 :: "r" (&c->redDither),
00754                                 "r" (uDest), "m" (chrDstW)
00755                                 : "%eax", "%edx", "%esi"
00756                         );
00757 
00758                 asm volatile(
00759                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
00760                                 :: "r" (&c->redDither),
00761                                 "r" (vDest), "m" (chrDstW)
00762                                 : "%eax", "%edx", "%esi"
00763                         );
00764         }
00765 
00766         asm volatile(
00767                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
00768                         :: "r" (&c->redDither),
00769                            "r" (dest), "m" (dstW)
00770                         : "%eax", "%edx", "%esi"
00771                 );
00772 #else
00773 #ifdef HAVE_ALTIVEC
00774 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
00775                       chrFilter, chrSrc, chrFilterSize,
00776                       dest, uDest, vDest, dstW, chrDstW);
00777 #else //HAVE_ALTIVEC
00778 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
00779             chrFilter, chrSrc, chrFilterSize,
00780             dest, uDest, vDest, dstW, chrDstW);
00781 #endif 
00782 #endif
00783 }
00784 
00785 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
00786                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
00787 {
00788 #ifdef HAVE_MMX
00789         if(uDest != NULL)
00790         {
00791                 asm volatile(
00792                                 YSCALEYUV2YV121
00793                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
00794                                 "g" (-chrDstW)
00795                                 : "%eax"
00796                         );
00797 
00798                 asm volatile(
00799                                 YSCALEYUV2YV121
00800                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
00801                                 "g" (-chrDstW)
00802                                 : "%eax"
00803                         );
00804         }
00805 
00806         asm volatile(
00807                 YSCALEYUV2YV121
00808                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
00809                 "g" (-dstW)
00810                 : "%eax"
00811         );
00812 #else
00813         int i;
00814         for(i=0; i<dstW; i++)
00815         {
00816                 int val= lumSrc[i]>>7;
00817                 
00818                 if(val&256){
00819                         if(val<0) val=0;
00820                         else      val=255;
00821                 }
00822 
00823                 dest[i]= val;
00824         }
00825 
00826         if(uDest != NULL)
00827                 for(i=0; i<chrDstW; i++)
00828                 {
00829                         int u=chrSrc[i]>>7;
00830                         int v=chrSrc[i + 2048]>>7;
00831 
00832                         if((u|v)&256){
00833                                 if(u<0)         u=0;
00834                                 else if (u>255) u=255;
00835                                 if(v<0)         v=0;
00836                                 else if (v>255) v=255;
00837                         }
00838 
00839                         uDest[i]= u;
00840                         vDest[i]= v;
00841                 }
00842 #endif
00843 }
00844 
00845 
00849 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00850                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00851                             uint8_t *dest, int dstW, int dstY)
00852 {
00853         int dummy=0;
00854         switch(c->dstFormat)
00855         {
00856 #ifdef HAVE_MMX
00857         case IMGFMT_BGR32:
00858                 {
00859                         asm volatile(
00860                                 YSCALEYUV2RGBX
00861                                 WRITEBGR32(%4, %5, %%eax)
00862 
00863                         :: "r" (&c->redDither), 
00864                            "m" (dummy), "m" (dummy), "m" (dummy),
00865                            "r" (dest), "m" (dstW)
00866                         : "%eax", "%edx", "%esi"
00867                         );
00868                 }
00869                 break;
00870         case IMGFMT_BGR24:
00871                 {
00872                         asm volatile(
00873                                 YSCALEYUV2RGBX
00874                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize
00875                                 "addl %4, %%ebx                 \n\t"
00876                                 WRITEBGR24(%%ebx, %5, %%eax)
00877 
00878                         :: "r" (&c->redDither), 
00879                            "m" (dummy), "m" (dummy), "m" (dummy),
00880                            "r" (dest), "m" (dstW)
00881                         : "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
00882                         );
00883                 }
00884                 break;
00885         case IMGFMT_BGR15:
00886                 {
00887                         asm volatile(
00888                                 YSCALEYUV2RGBX
00889                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00890 #ifdef DITHER1XBPP
00891                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
00892                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
00893                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
00894 #endif
00895 
00896                                 WRITEBGR15(%4, %5, %%eax)
00897 
00898                         :: "r" (&c->redDither), 
00899                            "m" (dummy), "m" (dummy), "m" (dummy),
00900                            "r" (dest), "m" (dstW)
00901                         : "%eax", "%edx", "%esi"
00902                         );
00903                 }
00904                 break;
00905         case IMGFMT_BGR16:
00906                 {
00907                         asm volatile(
00908                                 YSCALEYUV2RGBX
00909                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00910 #ifdef DITHER1XBPP
00911                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
00912                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
00913                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
00914 #endif
00915 
00916                                 WRITEBGR16(%4, %5, %%eax)
00917 
00918                         :: "r" (&c->redDither), 
00919                            "m" (dummy), "m" (dummy), "m" (dummy),
00920                            "r" (dest), "m" (dstW)
00921                         : "%eax", "%edx", "%esi"
00922                         );
00923                 }
00924                 break;
00925         case IMGFMT_YUY2:
00926                 {
00927                         asm volatile(
00928                                 YSCALEYUV2PACKEDX
00929                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00930 
00931                                 "psraw $3, %%mm3                \n\t"
00932                                 "psraw $3, %%mm4                \n\t"
00933                                 "psraw $3, %%mm1                \n\t"
00934                                 "psraw $3, %%mm7                \n\t"
00935                                 WRITEYUY2(%4, %5, %%eax)
00936 
00937                         :: "r" (&c->redDither), 
00938                            "m" (dummy), "m" (dummy), "m" (dummy),
00939                            "r" (dest), "m" (dstW)
00940                         : "%eax", "%edx", "%esi"
00941                         );
00942                 }
00943                 break;
00944 #endif
00945         default:
00946 #ifdef HAVE_ALTIVEC
00947                 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
00948                             chrFilter, chrSrc, chrFilterSize,
00949                             dest, dstW, dstY);
00950 #else
00951                 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
00952                             chrFilter, chrSrc, chrFilterSize,
00953                             dest, dstW, dstY);
00954 #endif
00955                 break;
00956         }
00957 }
00958 
00962 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
00963                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
00964 {
00965         int yalpha1=yalpha^4095;
00966         int uvalpha1=uvalpha^4095;
00967         int i;
00968 
00969 #if 0 //isn't used
00970         if(flags&SWS_FULL_CHR_H_INT)
00971         {
00972                 switch(dstFormat)
00973                 {
00974 #ifdef HAVE_MMX
00975                 case IMGFMT_BGR32:
00976                         asm volatile(
00977 
00978 
00979 FULL_YSCALEYUV2RGB
00980                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
00981                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
00982 
00983                         "movq %%mm3, %%mm1              \n\t"
00984                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
00985                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
00986 
00987                         MOVNTQ(%%mm3, (%4, %%eax, 4))
00988                         MOVNTQ(%%mm1, 8(%4, %%eax, 4))
00989 
00990                         "addl $4, %%eax                 \n\t"
00991                         "cmpl %5, %%eax                 \n\t"
00992                         " jb 1b                         \n\t"
00993 
00994 
00995                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
00996                         "m" (yalpha1), "m" (uvalpha1)
00997                         : "%eax"
00998                         );
00999                         break;
01000                 case IMGFMT_BGR24:
01001                         asm volatile(
01002 
01003 FULL_YSCALEYUV2RGB
01004 
01005                                                                 // lsb ... msb
01006                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
01007                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
01008 
01009                         "movq %%mm3, %%mm1              \n\t"
01010                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
01011                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
01012 
01013                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
01014                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
01015                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
01016                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
01017                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
01018                         "movq %%mm1, %%mm2              \n\t"
01019                         "psllq $48, %%mm1               \n\t" // 000000BG
01020                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
01021 
01022                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
01023                         "psrld $16, %%mm2               \n\t" // R000R000
01024                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
01025                         "por %%mm2, %%mm1               \n\t" // RBGRR000
01026 
01027                         "movl %4, %%ebx                 \n\t"
01028                         "addl %%eax, %%ebx              \n\t"
01029 
01030 #ifdef HAVE_MMX2
01031                         //FIXME Alignment
01032                         "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
01033                         "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
01034 #else
01035                         "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
01036                         "psrlq $32, %%mm3               \n\t"
01037                         "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
01038                         "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
01039 #endif
01040                         "addl $4, %%eax                 \n\t"
01041                         "cmpl %5, %%eax                 \n\t"
01042                         " jb 1b                         \n\t"
01043 
01044                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
01045                         "m" (yalpha1), "m" (uvalpha1)
01046                         : "%eax", "%ebx"
01047                         );
01048                         break;
01049                 case IMGFMT_BGR15:
01050                         asm volatile(
01051 
01052 FULL_YSCALEYUV2RGB
01053 #ifdef DITHER1XBPP
01054                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
01055                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
01056                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
01057 #endif
01058                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
01059                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
01060                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
01061 
01062                         "psrlw $3, %%mm3                \n\t"
01063                         "psllw $2, %%mm1                \n\t"
01064                         "psllw $7, %%mm0                \n\t"
01065                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
01066                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
01067 
01068                         "por %%mm3, %%mm1               \n\t"
01069                         "por %%mm1, %%mm0               \n\t"
01070 
01071                         MOVNTQ(%%mm0, (%4, %%eax, 2))
01072 
01073                         "addl $4, %%eax                 \n\t"
01074                         "cmpl %5, %%eax                 \n\t"
01075                         " jb 1b                         \n\t"
01076 
01077                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
01078                         "m" (yalpha1), "m" (uvalpha1)
01079                         : "%eax"
01080                         );
01081                         break;
01082                 case IMGFMT_BGR16:
01083                         asm volatile(
01084 
01085 FULL_YSCALEYUV2RGB
01086 #ifdef DITHER1XBPP
01087                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
01088                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
01089                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
01090 #endif
01091                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
01092                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
01093                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
01094 
01095                         "psrlw $3, %%mm3                \n\t"
01096                         "psllw $3, %%mm1                \n\t"
01097                         "psllw $8, %%mm0                \n\t"
01098                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
01099                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
01100 
01101                         "por %%mm3, %%mm1               \n\t"
01102                         "por %%mm1, %%mm0               \n\t"
01103 
01104                         MOVNTQ(%%mm0, (%4, %%eax, 2))
01105 
01106                         "addl $4, %%eax                 \n\t"
01107                         "cmpl %5, %%eax                 \n\t"
01108                         " jb 1b                         \n\t"
01109 
01110                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
01111                         "m" (yalpha1), "m" (uvalpha1)
01112                         : "%eax"
01113                         );
01114                 break;
01115 #endif
01116                 case IMGFMT_RGB32:
01117 #ifndef HAVE_MMX
01118                 case IMGFMT_BGR32:
01119 #endif
01120                 if(dstFormat==IMGFMT_BGR32)
01121                 {
01122                         int i;
01123 #ifdef WORDS_BIGENDIAN
01124                         dest++;
01125 #endif
01126                         for(i=0;i<dstW;i++){
01127                                 // vertical linear interpolation && yuv2rgb in a single step:
01128                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
01129                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
01130                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
01131                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
01132                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
01133                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
01134                                 dest+= 4;
01135                         }
01136                 }
01137                 else if(dstFormat==IMGFMT_BGR24)
01138                 {
01139                         int i;
01140                         for(i=0;i<dstW;i++){
01141                                 // vertical linear interpolation && yuv2rgb in a single step:
01142                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
01143                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
01144                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
01145                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
01146                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
01147                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
01148                                 dest+= 3;
01149                         }
01150                 }
01151                 else if(dstFormat==IMGFMT_BGR16)
01152                 {
01153                         int i;
01154                         for(i=0;i<dstW;i++){
01155                                 // vertical linear interpolation && yuv2rgb in a single step:
01156                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
01157                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
01158                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
01159 
01160                                 ((uint16_t*)dest)[i] =
01161                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
01162                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
01163                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
01164                         }
01165                 }
01166                 else if(dstFormat==IMGFMT_BGR15)
01167                 {
01168                         int i;
01169                         for(i=0;i<dstW;i++){
01170                                 // vertical linear interpolation && yuv2rgb in a single step:
01171                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
01172                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
01173                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
01174 
01175                                 ((uint16_t*)dest)[i] =
01176                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
01177                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
01178                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
01179                         }
01180                 }
01181         }//FULL_UV_IPOL
01182         else
01183         {
01184 #endif // if 0
01185 #ifdef HAVE_MMX
01186         switch(c->dstFormat)
01187         {
01188 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
01189         case IMGFMT_BGR32:
01190                         asm volatile(
01191                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01192                                 "movl %4, %%esp                         \n\t"
01193                                 YSCALEYUV2RGB(%%eax, %5)
01194                                 WRITEBGR32(%%esp, 8280(%5), %%eax)
01195                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01196 
01197                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01198                         "r" (&c->redDither)
01199                         : "%eax"
01200                         );
01201                         return;
01202         case IMGFMT_BGR24:
01203                         asm volatile(
01204                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01205                                 "movl %4, %%esp                 \n\t"
01206                                 YSCALEYUV2RGB(%%eax, %5)
01207                                 WRITEBGR24(%%esp, 8280(%5), %%eax)
01208                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01209                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01210                         "r" (&c->redDither)
01211                         : "%eax"
01212                         );
01213                         return;
01214         case IMGFMT_BGR15:
01215                         asm volatile(
01216                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01217                                 "movl %4, %%esp                         \n\t"
01218                                 YSCALEYUV2RGB(%%eax, %5)
01219                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01220 #ifdef DITHER1XBPP
01221                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01222                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
01223                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01224 #endif
01225 
01226                                 WRITEBGR15(%%esp, 8280(%5), %%eax)
01227                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01228 
01229                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01230                         "r" (&c->redDither)
01231                         : "%eax"
01232                         );
01233                         return;
01234         case IMGFMT_BGR16:
01235                         asm volatile(
01236                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01237                                 "movl %4, %%esp                         \n\t"
01238                                 YSCALEYUV2RGB(%%eax, %5)
01239                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01240 #ifdef DITHER1XBPP
01241                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01242                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
01243                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01244 #endif
01245 
01246                                 WRITEBGR16(%%esp, 8280(%5), %%eax)
01247                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01248                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01249                         "r" (&c->redDither)
01250                         : "%eax"
01251                         );
01252                         return;
01253         case IMGFMT_YUY2:
01254                         asm volatile(
01255                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01256                                 "movl %4, %%esp                         \n\t"
01257                                 YSCALEYUV2PACKED(%%eax, %5)
01258                                 WRITEYUY2(%%esp, 8280(%5), %%eax)
01259                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01260                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01261                         "r" (&c->redDither)
01262                         : "%eax"
01263                         );
01264                         return;
01265         default: break;
01266         }
01267 #endif //HAVE_MMX
01268 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
01269 }
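/*
 * Two notes on the MMX paths above (sketch, not authoritative):
 * - "movl %%esp, "ESP_OFFSET"(%5)" stashes the stack pointer in the SwsContext
 *   so that %esp can temporarily serve as the dest pointer; every general
 *   register is needed, and %esp is restored from ESP_OFFSET(%5) before the
 *   asm block ends.
 * - The vertical blend itself is the same one the disabled C code above
 *   performs: each output sample mixes buf0/buf1 (and uvbuf0/uvbuf1) with
 *   12-bit weights, roughly:
 */
#if 0 /* illustrative only, never compiled */
static inline int vblend_example(int a, int b, int alpha /* 0..4095 */)
{
        /* 'a' is weighted by ~(1-alpha), 'b' by alpha; the buffers carry 7
         * extra fractional bits, hence the >>19 seen in the C fallback. */
        return (a*(alpha^4095) + b*alpha) >> 19;
}
#endif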
01270 
01274 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
01275                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
01276 {
01277         const int yalpha1=0;
01278         int i;
01279         
01280         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
01281         const int yalpha= 4096; //FIXME ...
01282 
01283         if(flags&SWS_FULL_CHR_H_INT)
01284         {
01285                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
01286                 return;
01287         }
01288 
01289 #ifdef HAVE_MMX
01290         if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance vertically by 0.5 pixels) but it's a bit faster
01291         {
01292                 switch(dstFormat)
01293                 {
01294                 case IMGFMT_BGR32:
01295                         asm volatile(
01296                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01297                                 "movl %4, %%esp                         \n\t"
01298                                 YSCALEYUV2RGB1(%%eax, %5)
01299                                 WRITEBGR32(%%esp, 8280(%5), %%eax)
01300                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01301 
01302                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01303                         "r" (&c->redDither)
01304                         : "%eax"
01305                         );
01306                         return;
01307                 case IMGFMT_BGR24:
01308                         asm volatile(
01309                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01310                                 "movl %4, %%esp                         \n\t"
01311                                 YSCALEYUV2RGB1(%%eax, %5)
01312                                 WRITEBGR24(%%esp, 8280(%5), %%eax)
01313                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01314 
01315                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01316                         "r" (&c->redDither)
01317                         : "%eax"
01318                         );
01319                         return;
01320                 case IMGFMT_BGR15:
01321                         asm volatile(
01322                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01323                                 "movl %4, %%esp                         \n\t"
01324                                 YSCALEYUV2RGB1(%%eax, %5)
01325                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01326 #ifdef DITHER1XBPP
01327                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01328                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
01329                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01330 #endif
01331                                 WRITEBGR15(%%esp, 8280(%5), %%eax)
01332                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01333 
01334                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01335                         "r" (&c->redDither)
01336                         : "%eax"
01337                         );
01338                         return;
01339                 case IMGFMT_BGR16:
01340                         asm volatile(
01341                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01342                                 "movl %4, %%esp                         \n\t"
01343                                 YSCALEYUV2RGB1(%%eax, %5)
01344                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01345 #ifdef DITHER1XBPP
01346                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01347                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
01348                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01349 #endif
01350 
01351                                 WRITEBGR16(%%esp, 8280(%5), %%eax)
01352                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01353 
01354                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01355                         "r" (&c->redDither)
01356                         : "%eax"
01357                         );
01358                         return;
01359                 case IMGFMT_YUY2:
01360                         asm volatile(
01361                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01362                                 "movl %4, %%esp                         \n\t"
01363                                 YSCALEYUV2PACKED1(%%eax, %5)
01364                                 WRITEYUY2(%%esp, 8280(%5), %%eax)
01365                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01366 
01367                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01368                         "r" (&c->redDither)
01369                         : "%eax"
01370                         );
01371                         return;
01372                 }
01373         }
01374         else
01375         {
01376                 switch(dstFormat)
01377                 {
01378                 case IMGFMT_BGR32:
01379                         asm volatile(
01380                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01381                                 "movl %4, %%esp                         \n\t"
01382                                 YSCALEYUV2RGB1b(%%eax, %5)
01383                                 WRITEBGR32(%%esp, 8280(%5), %%eax)
01384                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01385 
01386                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01387                         "r" (&c->redDither)
01388                         : "%eax"
01389                         );
01390                         return;
01391                 case IMGFMT_BGR24:
01392                         asm volatile(
01393                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01394                                 "movl %4, %%esp                         \n\t"
01395                                 YSCALEYUV2RGB1b(%%eax, %5)
01396                                 WRITEBGR24(%%esp, 8280(%5), %%eax)
01397                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01398 
01399                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01400                         "r" (&c->redDither)
01401                         : "%eax"
01402                         );
01403                         return;
01404                 case IMGFMT_BGR15:
01405                         asm volatile(
01406                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01407                                 "movl %4, %%esp                         \n\t"
01408                                 YSCALEYUV2RGB1b(%%eax, %5)
01409                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01410 #ifdef DITHER1XBPP
01411                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01412                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
01413                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01414 #endif
01415                                 WRITEBGR15(%%esp, 8280(%5), %%eax)
01416                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01417 
01418                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01419                         "r" (&c->redDither)
01420                         : "%eax"
01421                         );
01422                         return;
01423                 case IMGFMT_BGR16:
01424                         asm volatile(
01425                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01426                                 "movl %4, %%esp                         \n\t"
01427                                 YSCALEYUV2RGB1b(%%eax, %5)
01428                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01429 #ifdef DITHER1XBPP
01430                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
01431                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
01432                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
01433 #endif
01434 
01435                                 WRITEBGR16(%%esp, 8280(%5), %%eax)
01436                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01437 
01438                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01439                         "r" (&c->redDither)
01440                         : "%eax"
01441                         );
01442                         return;
01443                 case IMGFMT_YUY2:
01444                         asm volatile(
01445                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
01446                                 "movl %4, %%esp                         \n\t"
01447                                 YSCALEYUV2PACKED1b(%%eax, %5)
01448                                 WRITEYUY2(%%esp, 8280(%5), %%eax)
01449                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
01450 
01451                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
01452                         "r" (&c->redDither)
01453                         : "%eax"
01454                         );
01455                         return;
01456                 }
01457         }
01458 #endif
01459         if( uvalpha < 2048 )
01460         {
01461                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
01462         }else{
01463                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
01464         }
01465 }
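/*
 * Sketch of the chroma path choice above (not the actual macros): with
 * uvalpha < 2048 the YSCALEYUV2*1 variants read only uvbuf0 (nearest chroma
 * line, cheap but vertically shifted by half a pixel, as noted above), while
 * the *1b variants average uvbuf0 and uvbuf1.  Assuming the usual 7
 * fractional bits in the uv buffers, per sample this is roughly:
 */
#if 0 /* illustrative only, never compiled */
static inline int chroma_sample_example(int u0, int u1, int uvalpha)
{
        if(uvalpha < 2048) return  u0       >> 7;   /* nearest line */
        else               return (u0 + u1) >> 8;   /* average of both lines */
}
#endif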
01466 
01467 //FIXME yuy2* can read up to 7 samples too many
01468 
01469 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
01470 {
01471 #ifdef HAVE_MMX
01472         asm volatile(
01473                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
01474                 "movl %0, %%eax                 \n\t"
01475                 "1:                             \n\t"
01476                 "movq (%1, %%eax,2), %%mm0      \n\t"
01477                 "movq 8(%1, %%eax,2), %%mm1     \n\t"
01478                 "pand %%mm2, %%mm0              \n\t"
01479                 "pand %%mm2, %%mm1              \n\t"
01480                 "packuswb %%mm1, %%mm0          \n\t"
01481                 "movq %%mm0, (%2, %%eax)        \n\t"
01482                 "addl $8, %%eax                 \n\t"
01483                 " js 1b                         \n\t"
01484                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
01485                 : "%eax"
01486         );
01487 #else
01488         int i;
01489         for(i=0; i<width; i++)
01490                 dst[i]= src[2*i];
01491 #endif
01492 }
01493 
01494 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01495 {
01496 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01497         asm volatile(
01498                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
01499                 "movl %0, %%eax                 \n\t"
01500                 "1:                             \n\t"
01501                 "movq (%1, %%eax,4), %%mm0      \n\t"
01502                 "movq 8(%1, %%eax,4), %%mm1     \n\t"
01503                 "movq (%2, %%eax,4), %%mm2      \n\t"
01504                 "movq 8(%2, %%eax,4), %%mm3     \n\t"
01505                 PAVGB(%%mm2, %%mm0)
01506                 PAVGB(%%mm3, %%mm1)
01507                 "psrlw $8, %%mm0                \n\t"
01508                 "psrlw $8, %%mm1                \n\t"
01509                 "packuswb %%mm1, %%mm0          \n\t"
01510                 "movq %%mm0, %%mm1              \n\t"
01511                 "psrlw $8, %%mm0                \n\t"
01512                 "pand %%mm4, %%mm1              \n\t"
01513                 "packuswb %%mm0, %%mm0          \n\t"
01514                 "packuswb %%mm1, %%mm1          \n\t"
01515                 "movd %%mm0, (%4, %%eax)        \n\t"
01516                 "movd %%mm1, (%3, %%eax)        \n\t"
01517                 "addl $4, %%eax                 \n\t"
01518                 " js 1b                         \n\t"
01519                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
01520                 : "%eax"
01521         );
01522 #else
01523         int i;
01524         for(i=0; i<width; i++)
01525         {
01526                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
01527                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
01528         }
01529 #endif
01530 }
01531 
01532 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
01533 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
01534 {
01535 #ifdef HAVE_MMX
01536         asm volatile(
01537                 "movl %0, %%eax                 \n\t"
01538                 "1:                             \n\t"
01539                 "movq (%1, %%eax,2), %%mm0      \n\t"
01540                 "movq 8(%1, %%eax,2), %%mm1     \n\t"
01541                 "psrlw $8, %%mm0                \n\t"
01542                 "psrlw $8, %%mm1                \n\t"
01543                 "packuswb %%mm1, %%mm0          \n\t"
01544                 "movq %%mm0, (%2, %%eax)        \n\t"
01545                 "addl $8, %%eax                 \n\t"
01546                 " js 1b                         \n\t"
01547                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
01548                 : "%eax"
01549         );
01550 #else
01551         int i;
01552         for(i=0; i<width; i++)
01553                 dst[i]= src[2*i+1];
01554 #endif
01555 }
01556 
01557 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01558 {
01559 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01560         asm volatile(
01561                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
01562                 "movl %0, %%eax                 \n\t"
01563                 "1:                             \n\t"
01564                 "movq (%1, %%eax,4), %%mm0      \n\t"
01565                 "movq 8(%1, %%eax,4), %%mm1     \n\t"
01566                 "movq (%2, %%eax,4), %%mm2      \n\t"
01567                 "movq 8(%2, %%eax,4), %%mm3     \n\t"
01568                 PAVGB(%%mm2, %%mm0)
01569                 PAVGB(%%mm3, %%mm1)
01570                 "pand %%mm4, %%mm0              \n\t"
01571                 "pand %%mm4, %%mm1              \n\t"
01572                 "packuswb %%mm1, %%mm0          \n\t"
01573                 "movq %%mm0, %%mm1              \n\t"
01574                 "psrlw $8, %%mm0                \n\t"
01575                 "pand %%mm4, %%mm1              \n\t"
01576                 "packuswb %%mm0, %%mm0          \n\t"
01577                 "packuswb %%mm1, %%mm1          \n\t"
01578                 "movd %%mm0, (%4, %%eax)        \n\t"
01579                 "movd %%mm1, (%3, %%eax)        \n\t"
01580                 "addl $4, %%eax                 \n\t"
01581                 " js 1b                         \n\t"
01582                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
01583                 : "%eax"
01584         );
01585 #else
01586         int i;
01587         for(i=0; i<width; i++)
01588         {
01589                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
01590                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
01591         }
01592 #endif
01593 }
01594 
01595 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
01596 {
01597 #ifdef HAVE_MMXFIXME
01598 #else
01599         int i;
01600         for(i=0; i<width; i++)
01601         {
01602                 int b=  ((uint32_t*)src)[i]&0xFF;
01603                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
01604                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
01605 
01606                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
01607         }
01608 #endif
01609 }
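/*
 * Note on the rounding constant above (shared by the other *ToY helpers):
 *   33<<(RGB2YUV_SHIFT-1) == 32<<(RGB2YUV_SHIFT-1) + 1<<(RGB2YUV_SHIFT-1)
 *                         == 16<<RGB2YUV_SHIFT + one half in fixed point,
 * so after the >>RGB2YUV_SHIFT the expression is just the rounded weighted
 * sum RY*r + GY*g + BY*b plus the +16 luma offset.
 */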
01610 
01611 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01612 {
01613 #ifdef HAVE_MMXFIXME
01614 #else
01615         int i;
01616         for(i=0; i<width; i++)
01617         {
01618                 const int a= ((uint32_t*)src1)[2*i+0];
01619                 const int e= ((uint32_t*)src1)[2*i+1];
01620                 const int c= ((uint32_t*)src2)[2*i+0];
01621                 const int d= ((uint32_t*)src2)[2*i+1];
01622                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
01623                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
01624                 const int b=  l&0x3FF;
01625                 const int g=  h>>8;
01626                 const int r=  l>>16;
01627 
01628                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
01629                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
01630         }
01631 #endif
01632 }
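/*
 * How the packed 2x2 averaging above works (sketch): a, e, c, d are the four
 * BGR32 pixels of a 2x2 block (two adjacent pixels from each source line).
 *   l = sum of (pixel & 0x00FF00FF)  -> the four B values summed in bits 0..9,
 *                                       the four R values summed in bits 16..25
 *   h = sum of (pixel & 0x0000FF00)  -> the four G values summed in bits 8..17
 * Each per-channel sum is at most 4*255, so the fields cannot overflow into
 * one another; b = l&0x3FF, g = h>>8 and r = l>>16 recover the sums, and the
 * extra +2 in the shift divides by 4 to average the block.
 */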
01633 
01634 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
01635 {
01636 #ifdef HAVE_MMX
01637         asm volatile(
01638                 "movl %2, %%eax                 \n\t"
01639                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
01640                 "movq "MANGLE(w1111)", %%mm5            \n\t"
01641                 "pxor %%mm7, %%mm7              \n\t"
01642                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
01643                 ".balign 16                     \n\t"
01644                 "1:                             \n\t"
01645                 PREFETCH" 64(%0, %%ebx)         \n\t"
01646                 "movd (%0, %%ebx), %%mm0        \n\t"
01647                 "movd 3(%0, %%ebx), %%mm1       \n\t"
01648                 "punpcklbw %%mm7, %%mm0         \n\t"
01649                 "punpcklbw %%mm7, %%mm1         \n\t"
01650                 "movd 6(%0, %%ebx), %%mm2       \n\t"
01651                 "movd 9(%0, %%ebx), %%mm3       \n\t"
01652                 "punpcklbw %%mm7, %%mm2         \n\t"
01653                 "punpcklbw %%mm7, %%mm3         \n\t"
01654                 "pmaddwd %%mm6, %%mm0           \n\t"
01655                 "pmaddwd %%mm6, %%mm1           \n\t"
01656                 "pmaddwd %%mm6, %%mm2           \n\t"
01657                 "pmaddwd %%mm6, %%mm3           \n\t"
01658 #ifndef FAST_BGR2YV12
01659                 "psrad $8, %%mm0                \n\t"
01660                 "psrad $8, %%mm1                \n\t"
01661                 "psrad $8, %%mm2                \n\t"
01662                 "psrad $8, %%mm3                \n\t"
01663 #endif
01664                 "packssdw %%mm1, %%mm0          \n\t"
01665                 "packssdw %%mm3, %%mm2          \n\t"
01666                 "pmaddwd %%mm5, %%mm0           \n\t"
01667                 "pmaddwd %%mm5, %%mm2           \n\t"
01668                 "packssdw %%mm2, %%mm0          \n\t"
01669                 "psraw $7, %%mm0                \n\t"
01670 
01671                 "movd 12(%0, %%ebx), %%mm4      \n\t"
01672                 "movd 15(%0, %%ebx), %%mm1      \n\t"
01673                 "punpcklbw %%mm7, %%mm4         \n\t"
01674                 "punpcklbw %%mm7, %%mm1         \n\t"
01675                 "movd 18(%0, %%ebx), %%mm2      \n\t"
01676                 "movd 21(%0, %%ebx), %%mm3      \n\t"
01677                 "punpcklbw %%mm7, %%mm2         \n\t"
01678                 "punpcklbw %%mm7, %%mm3         \n\t"
01679                 "pmaddwd %%mm6, %%mm4           \n\t"
01680                 "pmaddwd %%mm6, %%mm1           \n\t"
01681                 "pmaddwd %%mm6, %%mm2           \n\t"
01682                 "pmaddwd %%mm6, %%mm3           \n\t"
01683 #ifndef FAST_BGR2YV12
01684                 "psrad $8, %%mm4                \n\t"
01685                 "psrad $8, %%mm1                \n\t"
01686                 "psrad $8, %%mm2                \n\t"
01687                 "psrad $8, %%mm3                \n\t"
01688 #endif
01689                 "packssdw %%mm1, %%mm4          \n\t"
01690                 "packssdw %%mm3, %%mm2          \n\t"
01691                 "pmaddwd %%mm5, %%mm4           \n\t"
01692                 "pmaddwd %%mm5, %%mm2           \n\t"
01693                 "addl $24, %%ebx                \n\t"
01694                 "packssdw %%mm2, %%mm4          \n\t"
01695                 "psraw $7, %%mm4                \n\t"
01696 
01697                 "packuswb %%mm4, %%mm0          \n\t"
01698                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
01699 
01700                 "movq %%mm0, (%1, %%eax)        \n\t"
01701                 "addl $8, %%eax                 \n\t"
01702                 " js 1b                         \n\t"
01703                 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
01704                 : "%eax", "%ebx"
01705         );
01706 #else
01707         int i;
01708         for(i=0; i<width; i++)
01709         {
01710                 int b= src[i*3+0];
01711                 int g= src[i*3+1];
01712                 int r= src[i*3+2];
01713 
01714                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
01715         }
01716 #endif
01717 }
01718 
01719 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01720 {
01721 #ifdef HAVE_MMX
01722         asm volatile(
01723                 "movl %4, %%eax                 \n\t"
01724                 "movq "MANGLE(w1111)", %%mm5            \n\t"
01725                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
01726                 "pxor %%mm7, %%mm7              \n\t"
01727                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
01728                 "addl %%ebx, %%ebx              \n\t"
01729                 ".balign 16                     \n\t"
01730                 "1:                             \n\t"
01731                 PREFETCH" 64(%0, %%ebx)         \n\t"
01732                 PREFETCH" 64(%1, %%ebx)         \n\t"
01733 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01734                 "movq (%0, %%ebx), %%mm0        \n\t"
01735                 "movq (%1, %%ebx), %%mm1        \n\t"
01736                 "movq 6(%0, %%ebx), %%mm2       \n\t"
01737                 "movq 6(%1, %%ebx), %%mm3       \n\t"
01738                 PAVGB(%%mm1, %%mm0)
01739                 PAVGB(%%mm3, %%mm2)
01740                 "movq %%mm0, %%mm1              \n\t"
01741                 "movq %%mm2, %%mm3              \n\t"
01742                 "psrlq $24, %%mm0               \n\t"
01743                 "psrlq $24, %%mm2               \n\t"
01744                 PAVGB(%%mm1, %%mm0)
01745                 PAVGB(%%mm3, %%mm2)
01746                 "punpcklbw %%mm7, %%mm0         \n\t"
01747                 "punpcklbw %%mm7, %%mm2         \n\t"
01748 #else
01749                 "movd (%0, %%ebx), %%mm0        \n\t"
01750                 "movd (%1, %%ebx), %%mm1        \n\t"
01751                 "movd 3(%0, %%ebx), %%mm2       \n\t"
01752                 "movd 3(%1, %%ebx), %%mm3       \n\t"
01753                 "punpcklbw %%mm7, %%mm0         \n\t"
01754                 "punpcklbw %%mm7, %%mm1         \n\t"
01755                 "punpcklbw %%mm7, %%mm2         \n\t"
01756                 "punpcklbw %%mm7, %%mm3         \n\t"
01757                 "paddw %%mm1, %%mm0             \n\t"
01758                 "paddw %%mm3, %%mm2             \n\t"
01759                 "paddw %%mm2, %%mm0             \n\t"
01760                 "movd 6(%0, %%ebx), %%mm4       \n\t"
01761                 "movd 6(%1, %%ebx), %%mm1       \n\t"
01762                 "movd 9(%0, %%ebx), %%mm2       \n\t"
01763                 "movd 9(%1, %%ebx), %%mm3       \n\t"
01764                 "punpcklbw %%mm7, %%mm4         \n\t"
01765                 "punpcklbw %%mm7, %%mm1         \n\t"
01766                 "punpcklbw %%mm7, %%mm2         \n\t"
01767                 "punpcklbw %%mm7, %%mm3         \n\t"
01768                 "paddw %%mm1, %%mm4             \n\t"
01769                 "paddw %%mm3, %%mm2             \n\t"
01770                 "paddw %%mm4, %%mm2             \n\t"
01771                 "psrlw $2, %%mm0                \n\t"
01772                 "psrlw $2, %%mm2                \n\t"
01773 #endif
01774                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
01775                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
01776                 
01777                 "pmaddwd %%mm0, %%mm1           \n\t"
01778                 "pmaddwd %%mm2, %%mm3           \n\t"
01779                 "pmaddwd %%mm6, %%mm0           \n\t"
01780                 "pmaddwd %%mm6, %%mm2           \n\t"
01781 #ifndef FAST_BGR2YV12
01782                 "psrad $8, %%mm0                \n\t"
01783                 "psrad $8, %%mm1                \n\t"
01784                 "psrad $8, %%mm2                \n\t"
01785                 "psrad $8, %%mm3                \n\t"
01786 #endif
01787                 "packssdw %%mm2, %%mm0          \n\t"
01788                 "packssdw %%mm3, %%mm1          \n\t"
01789                 "pmaddwd %%mm5, %%mm0           \n\t"
01790                 "pmaddwd %%mm5, %%mm1           \n\t"
01791                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
01792                 "psraw $7, %%mm0                \n\t"
01793 
01794 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
01795                 "movq 12(%0, %%ebx), %%mm4      \n\t"
01796                 "movq 12(%1, %%ebx), %%mm1      \n\t"
01797                 "movq 18(%0, %%ebx), %%mm2      \n\t"
01798                 "movq 18(%1, %%ebx), %%mm3      \n\t"
01799                 PAVGB(%%mm1, %%mm4)
01800                 PAVGB(%%mm3, %%mm2)
01801                 "movq %%mm4, %%mm1              \n\t"
01802                 "movq %%mm2, %%mm3              \n\t"
01803                 "psrlq $24, %%mm4               \n\t"
01804                 "psrlq $24, %%mm2               \n\t"
01805                 PAVGB(%%mm1, %%mm4)
01806                 PAVGB(%%mm3, %%mm2)
01807                 "punpcklbw %%mm7, %%mm4         \n\t"
01808                 "punpcklbw %%mm7, %%mm2         \n\t"
01809 #else
01810                 "movd 12(%0, %%ebx), %%mm4      \n\t"
01811                 "movd 12(%1, %%ebx), %%mm1      \n\t"
01812                 "movd 15(%0, %%ebx), %%mm2      \n\t"
01813                 "movd 15(%1, %%ebx), %%mm3      \n\t"
01814                 "punpcklbw %%mm7, %%mm4         \n\t"
01815                 "punpcklbw %%mm7, %%mm1         \n\t"
01816                 "punpcklbw %%mm7, %%mm2         \n\t"
01817                 "punpcklbw %%mm7, %%mm3         \n\t"
01818                 "paddw %%mm1, %%mm4             \n\t"
01819                 "paddw %%mm3, %%mm2             \n\t"
01820                 "paddw %%mm2, %%mm4             \n\t"
01821                 "movd 18(%0, %%ebx), %%mm5      \n\t"
01822                 "movd 18(%1, %%ebx), %%mm1      \n\t"
01823                 "movd 21(%0, %%ebx), %%mm2      \n\t"
01824                 "movd 21(%1, %%ebx), %%mm3      \n\t"
01825                 "punpcklbw %%mm7, %%mm5         \n\t"
01826                 "punpcklbw %%mm7, %%mm1         \n\t"
01827                 "punpcklbw %%mm7, %%mm2         \n\t"
01828                 "punpcklbw %%mm7, %%mm3         \n\t"
01829                 "paddw %%mm1, %%mm5             \n\t"
01830                 "paddw %%mm3, %%mm2             \n\t"
01831                 "paddw %%mm5, %%mm2             \n\t"
01832                 "movq "MANGLE(w1111)", %%mm5            \n\t"
01833                 "psrlw $2, %%mm4                \n\t"
01834                 "psrlw $2, %%mm2                \n\t"
01835 #endif
01836                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
01837                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
01838                 
01839                 "pmaddwd %%mm4, %%mm1           \n\t"
01840                 "pmaddwd %%mm2, %%mm3           \n\t"
01841                 "pmaddwd %%mm6, %%mm4           \n\t"
01842                 "pmaddwd %%mm6, %%mm2           \n\t"
01843 #ifndef FAST_BGR2YV12
01844                 "psrad $8, %%mm4                \n\t"
01845                 "psrad $8, %%mm1                \n\t"
01846                 "psrad $8, %%mm2                \n\t"
01847                 "psrad $8, %%mm3                \n\t"
01848 #endif
01849                 "packssdw %%mm2, %%mm4          \n\t"
01850                 "packssdw %%mm3, %%mm1          \n\t"
01851                 "pmaddwd %%mm5, %%mm4           \n\t"
01852                 "pmaddwd %%mm5, %%mm1           \n\t"
01853                 "addl $24, %%ebx                \n\t"
01854                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
01855                 "psraw $7, %%mm4                \n\t"
01856                 
01857                 "movq %%mm0, %%mm1              \n\t"
01858                 "punpckldq %%mm4, %%mm0         \n\t"
01859                 "punpckhdq %%mm4, %%mm1         \n\t"
01860                 "packsswb %%mm1, %%mm0          \n\t"
01861                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
01862 
01863                 "movd %%mm0, (%2, %%eax)        \n\t"
01864                 "punpckhdq %%mm0, %%mm0         \n\t"
01865                 "movd %%mm0, (%3, %%eax)        \n\t"
01866                 "addl $4, %%eax                 \n\t"
01867                 " js 1b                         \n\t"
01868                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
01869                 : "%eax", "%ebx"
01870         );
01871 #else
01872         int i;
01873         for(i=0; i<width; i++)
01874         {
01875                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
01876                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
01877                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
01878 
01879                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
01880                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
01881         }
01882 #endif
01883 }
01884 
01885 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
01886 {
01887         int i;
01888         for(i=0; i<width; i++)
01889         {
01890                 int d= ((uint16_t*)src)[i];
01891                 int b= d&0x1F;
01892                 int g= (d>>5)&0x3F;
01893                 int r= (d>>11)&0x1F;
01894 
01895                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
01896         }
01897 }
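/*
 * Note on the scaling above: in RGB565, r and b are 5-bit (0..31) and g is
 * 6-bit (0..63).  Doubling the R/B coefficients and reducing the shift by 2
 * gives (2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)
 *     == (8*RY*r + 4*GY*g + 8*BY*b)>>RGB2YUV_SHIFT,
 * i.e. r and b are effectively scaled by 8 and g by 4, bringing all three
 * channels back to an ~8-bit range before the usual coefficients apply.
 */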
01898 
01899 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01900 {
01901         int i;
01902         for(i=0; i<width; i++)
01903         {
01904                 int d0= ((uint32_t*)src1)[i];
01905                 int d1= ((uint32_t*)src2)[i];
01906                 
01907                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
01908                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
01909 
01910                 int dh2= (dh>>11) + (dh<<21);
01911                 int d= dh2 + dl;
01912 
01913                 int b= d&0x7F;
01914                 int r= (d>>11)&0x7F;
01915                 int g= d>>21;
01916                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
01917                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
01918         }
01919 }
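/*
 * The masking/shifting above sums a 2x2 block of RGB565 pixels without
 * unpacking them: each uint32_t read holds two 16-bit pixels, and after the
 * dl/dh/dh2 shuffling the four B, R and G sums end up in separate fields of
 * d (b in bits 0..6, r in bits 11..17, g from bit 21 up).  The 2*
 * coefficients and the adjusted shift rescale the 5/6-bit channels as in
 * bgr16ToY, while the extra +2 in the shift divides the sums by 4.  A
 * straightforward (slower) equivalent, assuming the same little-endian
 * pixel order, would be:
 */
#if 0 /* illustrative only, never compiled */
static inline void bgr16ToUV_plain_example(uint8_t *dstU, uint8_t *dstV,
                                           uint16_t *line0, uint16_t *line1, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int p00= line0[2*i], p01= line0[2*i+1];  /* 2x2 block of pixels */
                int p10= line1[2*i], p11= line1[2*i+1];
                int b= ( p00     &0x1F) + ( p01     &0x1F) + ( p10     &0x1F) + ( p11     &0x1F);
                int g= ((p00>>5) &0x3F) + ((p01>>5) &0x3F) + ((p10>>5) &0x3F) + ((p11>>5) &0x3F);
                int r= ( p00>>11      ) + ( p01>>11      ) + ( p10>>11      ) + ( p11>>11      );

                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
#endif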
01920 
01921 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
01922 {
01923         int i;
01924         for(i=0; i<width; i++)
01925         {
01926                 int d= ((uint16_t*)src)[i];
01927                 int b= d&0x1F;
01928                 int g= (d>>5)&0x1F;
01929                 int r= (d>>10)&0x1F;
01930 
01931                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
01932         }
01933 }
01934 
01935 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01936 {
01937         int i;
01938         for(i=0; i<width; i++)
01939         {
01940                 int d0= ((uint32_t*)src1)[i];
01941                 int d1= ((uint32_t*)src2)[i];
01942                 
01943                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
01944                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
01945 
01946                 int dh2= (dh>>11) + (dh<<21);
01947                 int d= dh2 + dl;
01948 
01949                 int b= d&0x7F;
01950                 int r= (d>>10)&0x7F;
01951                 int g= d>>21;
01952                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
01953                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
01954         }
01955 }
01956 
01957 
01958 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
01959 {
01960         int i;
01961         for(i=0; i<width; i++)
01962         {
01963                 int r=  ((uint32_t*)src)[i]&0xFF;
01964                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
01965                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
01966 
01967                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
01968         }
01969 }
01970 
01971 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
01972 {
01973         int i;
01974         for(i=0; i<width; i++)
01975         {
01976                 const int a= ((uint32_t*)src1)[2*i+0];
01977                 const int e= ((uint32_t*)src1)[2*i+1];
01978                 const int c= ((uint32_t*)src2)[2*i+0];
01979                 const int d= ((uint32_t*)src2)[2*i+1];
01980                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
01981                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
01982                 const int r=  l&0x3FF;
01983                 const int g=  h>>8;
01984                 const int b=  l>>16;
01985 
01986                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
01987                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
01988         }
01989 }
01990 
01991 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
01992 {
01993         int i;
01994         for(i=0; i<width; i++)
01995         {
01996                 int r= src[i*3+0];
01997                 int g= src[i*3+1];
01998                 int b= src[i*3+2];
01999 
02000                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
02001         }
02002 }
02003 
02004 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
02005 {
02006         int i;
02007         for(i=0; i<width; i++)
02008         {
02009                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
02010                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
02011                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
02012 
02013                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
02014                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
02015         }
02016 }
02017 
02018 
02019 // Bilinear / Bicubic scaling
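/*
 * What follows is the horizontal FIR filter: for every output sample i the
 * code accumulates filterSize source pixels starting at filterPos[i],
 * weighted by 16-bit fixed-point taps, i.e. roughly
 *   dst[i] = clip( (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7 )
 * (the exact shift/clamp is the one used by the C fallback at the end of the
 * function).  The MMX paths merely specialize the common filterSize==4 and
 * filterSize==8 cases, producing two output samples per loop iteration with
 * pmaddwd; the generic path loops over the taps.
 */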
02020 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
02021                                   int16_t *filter, int16_t *filterPos, int filterSize)
02022 {
02023 #ifdef HAVE_MMX
02024         assert(filterSize % 4 == 0 && filterSize>0);
02025         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
02026         {
02027                 int counter= -2*dstW;
02028                 filter-= counter*2;
02029                 filterPos-= counter/2;
02030                 dst-= counter/2;
02031                 asm volatile(
02032                         "pxor %%mm7, %%mm7              \n\t"
02033                         "movq "MANGLE(w02)", %%mm6      \n\t"
02034                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
02035                         "movl %%eax, %%ebp              \n\t"
02036                         ".balign 16                     \n\t"
02037                         "1:                             \n\t"
02038                         "movzwl (%2, %%ebp), %%eax      \n\t"
02039                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
02040                         "movq (%1, %%ebp, 4), %%mm1     \n\t"
02041                         "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
02042                         "movd (%3, %%eax), %%mm0        \n\t"
02043                         "movd (%3, %%ebx), %%mm2        \n\t"
02044                         "punpcklbw %%mm7, %%mm0         \n\t"
02045                         "punpcklbw %%mm7, %%mm2         \n\t"
02046                         "pmaddwd %%mm1, %%mm0           \n\t"
02047                         "pmaddwd %%mm2, %%mm3           \n\t"
02048                         "psrad $8, %%mm0                \n\t"
02049                         "psrad $8, %%mm3                \n\t"
02050                         "packssdw %%mm3, %%mm0          \n\t"
02051                         "pmaddwd %%mm6, %%mm0           \n\t"
02052                         "packssdw %%mm0, %%mm0          \n\t"
02053                         "movd %%mm0, (%4, %%ebp)        \n\t"
02054                         "addl $4, %%ebp                 \n\t"
02055                         " jnc 1b                        \n\t"
02056 
02057                         "popl %%ebp                     \n\t"
02058                         : "+a" (counter)
02059                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
02060                         : "%ebx"
02061                 );
02062         }
02063         else if(filterSize==8)
02064         {
02065                 int counter= -2*dstW;
02066                 filter-= counter*4;
02067                 filterPos-= counter/2;
02068                 dst-= counter/2;
02069                 asm volatile(
02070                         "pxor %%mm7, %%mm7              \n\t"
02071                         "movq "MANGLE(w02)", %%mm6      \n\t"
02072                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
02073                         "movl %%eax, %%ebp              \n\t"
02074                         ".balign 16                     \n\t"
02075                         "1:                             \n\t"
02076                         "movzwl (%2, %%ebp), %%eax      \n\t"
02077                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
02078                         "movq (%1, %%ebp, 8), %%mm1     \n\t"
02079                         "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
02080                         "movd (%3, %%eax), %%mm0        \n\t"
02081                         "movd (%3, %%ebx), %%mm2        \n\t"
02082                         "punpcklbw %%mm7, %%mm0         \n\t"
02083                         "punpcklbw %%mm7, %%mm2         \n\t"
02084                         "pmaddwd %%mm1, %%mm0           \n\t"
02085                         "pmaddwd %%mm2, %%mm3           \n\t"
02086 
02087                         "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
02088                         "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
02089                         "movd 4(%3, %%eax), %%mm4       \n\t"
02090                         "movd 4(%3, %%ebx), %%mm2       \n\t"
02091                         "punpcklbw %%mm7, %%mm4         \n\t"
02092                         "punpcklbw %%mm7, %%mm2         \n\t"
02093                         "pmaddwd %%mm1, %%mm4           \n\t"
02094                         "pmaddwd %%mm2, %%mm5           \n\t"
02095                         "paddd %%mm4, %%mm0             \n\t"
02096                         "paddd %%mm5, %%mm3             \n\t"
02097                                                 
02098                         "psrad $8, %%mm0                \n\t"
02099                         "psrad $8, %%mm3                \n\t"
02100                         "packssdw %%mm3, %%mm0          \n\t"
02101                         "pmaddwd %%mm6, %%mm0           \n\t"
02102                         "packssdw %%mm0, %%mm0          \n\t"
02103                         "movd %%mm0, (%4, %%ebp)        \n\t"
02104                         "addl $4, %%ebp                 \n\t"
02105                         " jnc 1b                        \n\t"
02106 
02107                         "popl %%ebp                     \n\t"
02108                         : "+a" (counter)
02109                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
02110                         : "%ebx"
02111                 );
02112         }
02113         else
02114         {
02115                 int counter= -2*dstW;
02116 //              filter-= counter*filterSize/2;
02117                 filterPos-= counter/2;
02118                 dst-= counter/2;
02119                 asm volatile(
02120                         "pxor %%mm7, %%mm7              \n\t"
02121                         "movq "MANGLE(w02)", %%mm6      \n\t"
02122                         ".balign 16                     \n\t"
02123                         "1:                             \n\t"
02124                         "movl %2, %%ecx                 \n\t"
02125                         "movzwl (%%ecx, %0), %%eax      \n\t"
02126                         "movzwl 2(%%ecx, %0), %%ebx     \n\t"
02127                         "movl %5, %%ecx                 \n\t"
02128                         "pxor %%mm4, %%mm4              \n\t"
02129                         "pxor %%mm5, %%mm5              \n\t"
02130                         "2:                             \n\t"
02131                         "movq (%1), %%mm1               \n\t"
02132                         "movq (%1, %6), %%mm3           \n\t"
02133                         "movd (%%ecx, %%eax), %%mm0     \n\t"
02134                         "movd (%%ecx, %%ebx), %%mm2     \n\t"
02135                         "punpcklbw %%mm7, %%mm0         \n\t"
02136                         "punpcklbw %%mm7, %%mm2         \n\t"
02137                         "pmaddwd %%mm1, %%mm0           \n\t"
02138                         "pmaddwd %%mm2, %%mm3           \n\t"
02139                         "paddd %%mm3, %%mm5             \n\t"
02140                         "paddd %%mm0, %%mm4             \n\t"
02141                         "addl $8, %1                    \n\t"
02142                         "addl $4, %%ecx                 \n\t"
02143                         "cmpl %4, %%ecx                 \n\t"
02144                         " jb 2b                         \n\t"
02145                         "addl %6, %1                    \n\t"
02146                         "psrad $8, %%mm4                \n\t"
02147                         "psrad $8, %%mm5                \n\t"
02148                         "packssdw %%mm5, %%mm4          \n\t"
02149                         "pmaddwd %%mm6, %%mm4           \n\t"
02150                         "packssdw %%mm4, %%mm4          \n\t"
02151                         "movl %3, %%eax                 \n\t"
02152                         "movd %%mm4, (%%eax, %0)        \n\t"
02153                         "addl $4, %0                    \n\t"
02154                         " jnc 1b                        \n\t"
02155 
02156                         : "+r" (counter), "+r" (filter)
02157                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
02158                           "m" (src), "r" (filterSize*2)
02159                         : "%ebx", "%eax", "%ecx"
02160                 );
02161         }
02162 #else
02163 #ifdef HAVE_ALTIVEC
02164         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
02165 #else
02166         int i;
02167         for(i=0; i<dstW; i++)
02168         {
02169                 int j;
02170                 int srcPos= filterPos[i];
02171                 int val=0;
02172 //              printf("filterPos: %d\n", filterPos[i]);
02173                 for(j=0; j<filterSize; j++)
02174                 {
02175 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
02176                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
02177                 }
02178 //              filter += hFilterSize;
02179                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
02180 //              dst[i] = val>>7;
02181         }
02182 #endif
02183 #endif
02184 }
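
/* Editorial sketch, not part of the original file: the pointer-biasing idiom
   used by the filterSize==4/8 MMX branches above, written out in plain C.
   The name hScale8_sketch is hypothetical; dstW is assumed to be even (the
   asm makes the same assumption), and the clamping follows the C reference
   loop above, while the MMX path shifts by 8 and doubles via pmaddwd, which
   rounds slightly differently. */
static void hScale8_sketch(int16_t *dst, int dstW, uint8_t *src,
                           int16_t *filter, int16_t *filterPos)
{
        int counter= -2*dstW;           /* negative byte counter, as in the asm */
        filter-= counter*4;             /* 8 coefficients per output sample */
        filterPos-= counter/2;
        dst-= counter/2;
        while(counter < 0)
        {
                int k;
                for(k=0; k<2; k++)      /* two outputs per step, like the movd store */
                {
                        const int i= counter/2 + k;     /* negative element index */
                        const int srcPos= filterPos[i];
                        int j, val= 0;
                        for(j=0; j<8; j++)
                                val += ((int)src[srcPos + j])*filter[8*i + j];
                        dst[i]= MIN(MAX(0, val>>7), (1<<15)-1);
                }
                counter+= 4;            /* 4 bytes == 2 int16_t outputs; the asm exits on carry */
        }
}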
02185 // *** horizontal scale Y line to temp buffer
02186 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
02187                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
02188                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
02189                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
02190                                    int32_t *mmx2FilterPos)
02191 {
02192     if(srcFormat==IMGFMT_YUY2)
02193     {
02194         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
02195         src= formatConvBuffer;
02196     }
02197     else if(srcFormat==IMGFMT_UYVY)
02198     {
02199         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
02200         src= formatConvBuffer;
02201     }
02202     else if(srcFormat==IMGFMT_BGR32)
02203     {
02204         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
02205         src= formatConvBuffer;
02206     }
02207     else if(srcFormat==IMGFMT_BGR24)
02208     {
02209         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
02210         src= formatConvBuffer;
02211     }
02212     else if(srcFormat==IMGFMT_BGR16)
02213     {
02214         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
02215         src= formatConvBuffer;
02216     }
02217     else if(srcFormat==IMGFMT_BGR15)
02218     {
02219         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
02220         src= formatConvBuffer;
02221     }
02222     else if(srcFormat==IMGFMT_RGB32)
02223     {
02224         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
02225         src= formatConvBuffer;
02226     }
02227     else if(srcFormat==IMGFMT_RGB24)
02228     {
02229         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
02230         src= formatConvBuffer;
02231     }
02232 
02233 #ifdef HAVE_MMX
02234                         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
02235     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
02236 #else
02237     if(!(flags&SWS_FAST_BILINEAR))
02238 #endif
02239     {
02240         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
02241     }
02242     else // Fast Bilinear upscale / crap downscale
02243     {
02244 #ifdef ARCH_X86
02245 #ifdef HAVE_MMX2
02246         int i;
02247         if(canMMX2BeUsed)
02248         {
02249                 asm volatile(
02250                         "pxor %%mm7, %%mm7              \n\t"
02251                         "movl %0, %%ecx                 \n\t"
02252                         "movl %1, %%edi                 \n\t"
02253                         "movl %2, %%edx                 \n\t"
02254                         "movl %3, %%ebx                 \n\t"
02255                         "xorl %%eax, %%eax              \n\t" // i
02256                         PREFETCH" (%%ecx)               \n\t"
02257                         PREFETCH" 32(%%ecx)             \n\t"
02258                         PREFETCH" 64(%%ecx)             \n\t"
02259 
02260 #define FUNNY_Y_CODE \
02261                         "movl (%%ebx), %%esi            \n\t"\
02262                         "call *%4                       \n\t"\
02263                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
02264                         "addl %%eax, %%edi              \n\t"\
02265                         "xorl %%eax, %%eax              \n\t"\
02266 
02267 FUNNY_Y_CODE
02268 FUNNY_Y_CODE
02269 FUNNY_Y_CODE
02270 FUNNY_Y_CODE
02271 FUNNY_Y_CODE
02272 FUNNY_Y_CODE
02273 FUNNY_Y_CODE
02274 FUNNY_Y_CODE
02275 
02276                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
02277                         "m" (funnyYCode)
02278                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
02279                 );
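                /* outputs whose source position lands on or past srcW-1 would
                   need src[srcW], so they are patched up here with the last
                   input pixel, scaled by 128 to match the <<7 scaling used
                   for the rest of the buffer */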
02280                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
02281         }
02282         else
02283         {
02284 #endif
02285         //NO MMX just normal asm ...
02286         asm volatile(
02287                 "xorl %%eax, %%eax              \n\t" // i
02288                 "xorl %%ebx, %%ebx              \n\t" // xx
02289                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
02290                 ".balign 16                     \n\t"
02291                 "1:                             \n\t"
02292                 "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
02293                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
02294                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
02295                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
02296                 "shll $16, %%edi                \n\t"
02297                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02298                 "movl %1, %%edi                 \n\t"
02299                 "shrl $9, %%esi                 \n\t"
02300                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
02301                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
02302                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
02303 
02304                 "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
02305                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
02306                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
02307                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
02308                 "shll $16, %%edi                \n\t"
02309                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02310                 "movl %1, %%edi                 \n\t"
02311                 "shrl $9, %%esi                 \n\t"
02312                 "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
02313                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
02314                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
02315 
02316 
02317                 "addl $2, %%eax                 \n\t"
02318                 "cmpl %2, %%eax                 \n\t"
02319                 " jb 1b                         \n\t"
02320 
02321 
02322                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
02323                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
02324                 );
02325 #ifdef HAVE_MMX2
02326         } //if MMX2 can't be used
02327 #endif
02328 #else
02329         int i;
02330         unsigned int xpos=0;
02331         for(i=0;i<dstWidth;i++)
02332         {
02333                 register unsigned int xx=xpos>>16;
02334                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
02335                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
02336                 xpos+=xInc;
02337         }
02338 #endif
02339     }
02340 }
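
/* Editorial sketch, not part of the original file: the split 16.16 fixed-point
   step performed by the addw/adcl pair in the asm above.  The name
   hyscale_fast_sketch is hypothetical; xInc is assumed to be roughly
   (srcW<<16)/dstWidth (it is set up elsewhere in swscale), and the src[xx+1]
   read at the very last pixel that the real code patches up is ignored here
   for brevity. */
static void hyscale_fast_sketch(uint16_t *dst, int dstWidth,
                                uint8_t *src, int xInc)
{
        unsigned xx= 0;                 /* integer source index   (%%ebx) */
        unsigned frac= 0;               /* 16-bit fractional part (%%cx)  */
        const unsigned stepInt = ((unsigned)xInc)>>16;
        const unsigned stepFrac= xInc & 0xFFFF;
        int i;
        for(i=0; i<dstWidth; i++)
        {
                const unsigned xalpha= frac>>9;         /* 7-bit blend weight */
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                frac+= stepFrac;
                xx+= stepInt + (frac>>16);              /* the adcl picks up this carry */
                frac&= 0xFFFF;
        }
}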
02341 
02342 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
02343                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
02344                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
02345                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
02346                                    int32_t *mmx2FilterPos)
02347 {
02348     if(srcFormat==IMGFMT_YUY2)
02349     {
02350         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02351         src1= formatConvBuffer;
02352         src2= formatConvBuffer+2048;
02353     }
02354     else if(srcFormat==IMGFMT_UYVY)
02355     {
02356         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02357         src1= formatConvBuffer;
02358         src2= formatConvBuffer+2048;
02359     }
02360     else if(srcFormat==IMGFMT_BGR32)
02361     {
02362         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02363         src1= formatConvBuffer;
02364         src2= formatConvBuffer+2048;
02365     }
02366     else if(srcFormat==IMGFMT_BGR24)
02367     {
02368         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02369         src1= formatConvBuffer;
02370         src2= formatConvBuffer+2048;
02371     }
02372     else if(srcFormat==IMGFMT_BGR16)
02373     {
02374         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02375         src1= formatConvBuffer;
02376         src2= formatConvBuffer+2048;
02377     }
02378     else if(srcFormat==IMGFMT_BGR15)
02379     {
02380         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02381         src1= formatConvBuffer;
02382         src2= formatConvBuffer+2048;
02383     }
02384     else if(srcFormat==IMGFMT_RGB32)
02385     {
02386         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02387         src1= formatConvBuffer;
02388         src2= formatConvBuffer+2048;
02389     }
02390     else if(srcFormat==IMGFMT_RGB24)
02391     {
02392         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
02393         src1= formatConvBuffer;
02394         src2= formatConvBuffer+2048;
02395     }
02396     else if(isGray(srcFormat))
02397     {
02398         return;
02399     }
02400 
02401 #ifdef HAVE_MMX
02402         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
02403     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
02404 #else
02405     if(!(flags&SWS_FAST_BILINEAR))
02406 #endif
02407     {
02408         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02409         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02410     }
02411     else // Fast Bilinear upscale / crap downscale
02412     {
02413 #ifdef ARCH_X86
02414 #ifdef HAVE_MMX2
02415         int i;
02416         if(canMMX2BeUsed)
02417         {
02418                 asm volatile(
02419                         "pxor %%mm7, %%mm7              \n\t"
02420                         "movl %0, %%ecx                 \n\t"
02421                         "movl %1, %%edi                 \n\t"
02422                         "movl %2, %%edx                 \n\t"
02423                         "movl %3, %%ebx                 \n\t"
02424                         "xorl %%eax, %%eax              \n\t" // i
02425                         PREFETCH" (%%ecx)               \n\t"
02426                         PREFETCH" 32(%%ecx)             \n\t"
02427                         PREFETCH" 64(%%ecx)             \n\t"
02428 
02429 #define FUNNY_UV_CODE \
02430                         "movl (%%ebx), %%esi            \n\t"\
02431                         "call *%4                       \n\t"\
02432                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
02433                         "addl %%eax, %%edi              \n\t"\
02434                         "xorl %%eax, %%eax              \n\t"\
02435 
02436 FUNNY_UV_CODE
02437 FUNNY_UV_CODE
02438 FUNNY_UV_CODE
02439 FUNNY_UV_CODE
02440                         "xorl %%eax, %%eax              \n\t" // i
02441                         "movl %5, %%ecx                 \n\t" // src
02442                         "movl %1, %%edi                 \n\t" // buf1
02443                         "addl $4096, %%edi              \n\t"
02444                         PREFETCH" (%%ecx)               \n\t"
02445                         PREFETCH" 32(%%ecx)             \n\t"
02446                         PREFETCH" 64(%%ecx)             \n\t"
02447 
02448 FUNNY_UV_CODE
02449 FUNNY_UV_CODE
02450 FUNNY_UV_CODE
02451 FUNNY_UV_CODE
02452 
02453                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
02454                         "m" (funnyUVCode), "m" (src2)
02455                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
02456                 );
02457                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
02458                 {
02459 //                      printf("%d %d %d\n", dstWidth, i, srcW);
02460                         dst[i] = src1[srcW-1]*128;
02461                         dst[i+2048] = src2[srcW-1]*128;
02462                 }
02463         }
02464         else
02465         {
02466 #endif
02467         asm volatile(
02468                 "xorl %%eax, %%eax              \n\t" // i
02469                 "xorl %%ebx, %%ebx              \n\t" // xx
02470                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
02471                 ".balign 16                     \n\t"
02472                 "1:                             \n\t"
02473                 "movl %0, %%esi                 \n\t"
02474                 "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
02475                 "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
02476                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
02477                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
02478                 "shll $16, %%edi                \n\t"
02479                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02480                 "movl %1, %%edi                 \n\t"
02481                 "shrl $9, %%esi                 \n\t"
02482                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
02483 
02484                 "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
02485                 "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
02486                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
02487                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
02488                 "shll $16, %%edi                \n\t"
02489                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02490                 "movl %1, %%edi                 \n\t"
02491                 "shrl $9, %%esi                 \n\t"
02492                 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
02493 
02494                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
02495                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
02496                 "addl $1, %%eax                 \n\t"
02497                 "cmpl %2, %%eax                 \n\t"
02498                 " jb 1b                         \n\t"
02499 
02500                 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
02501                 "r" (src2)
02502                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
02503                 );
02504 #ifdef HAVE_MMX2
02505         } //if MMX2 can't be used
02506 #endif
02507 #else
02508         int i;
02509         unsigned int xpos=0;
02510         for(i=0;i<dstWidth;i++)
02511         {
02512                 register unsigned int xx=xpos>>16;
02513                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
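                /* xalpha is at most 127 here, so (xalpha^127) is just a cheap
                   way of computing 127-xalpha, the complementary weight */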
02514                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
02515                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
02516 /* slower
02517           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
02518           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
02519 */
02520                 xpos+=xInc;
02521         }
02522 #endif
02523    }
02524 }
02525 
02526 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
02527              int srcSliceH, uint8_t* dst[], int dstStride[]){
02528 
02529         /* load a few things into local vars to make the code more readable and faster */
02530         const int srcW= c->srcW;
02531         const int dstW= c->dstW;
02532         const int dstH= c->dstH;
02533         const int chrDstW= c->chrDstW;
02534         const int chrSrcW= c->chrSrcW;
02535         const int lumXInc= c->lumXInc;
02536         const int chrXInc= c->chrXInc;
02537         const int dstFormat= c->dstFormat;
02538         const int srcFormat= c->srcFormat;
02539         const int flags= c->flags;
02540         const int canMMX2BeUsed= c->canMMX2BeUsed;
02541         int16_t *vLumFilterPos= c->vLumFilterPos;
02542         int16_t *vChrFilterPos= c->vChrFilterPos;
02543         int16_t *hLumFilterPos= c->hLumFilterPos;
02544         int16_t *hChrFilterPos= c->hChrFilterPos;
02545         int16_t *vLumFilter= c->vLumFilter;
02546         int16_t *vChrFilter= c->vChrFilter;
02547         int16_t *hLumFilter= c->hLumFilter;
02548         int16_t *hChrFilter= c->hChrFilter;
02549         int32_t *lumMmxFilter= c->lumMmxFilter;
02550         int32_t *chrMmxFilter= c->chrMmxFilter;
02551         const int vLumFilterSize= c->vLumFilterSize;
02552         const int vChrFilterSize= c->vChrFilterSize;
02553         const int hLumFilterSize= c->hLumFilterSize;
02554         const int hChrFilterSize= c->hChrFilterSize;
02555         int16_t **lumPixBuf= c->lumPixBuf;
02556         int16_t **chrPixBuf= c->chrPixBuf;
02557         const int vLumBufSize= c->vLumBufSize;
02558         const int vChrBufSize= c->vChrBufSize;
02559         uint8_t *funnyYCode= c->funnyYCode;
02560         uint8_t *funnyUVCode= c->funnyUVCode;
02561         uint8_t *formatConvBuffer= c->formatConvBuffer;
02562         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
02563         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
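        /* -((-x) >> s) rounds the shift up rather than down, i.e. chrSrcSliceH
           is ceil(srcSliceH / 2^chrSrcVSubSample) */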
02564         int lastDstY;
02565 
02566         /* vars which will change and which we need to store back in the context */
02567         int dstY= c->dstY;
02568         int lumBufIndex= c->lumBufIndex;
02569         int chrBufIndex= c->chrBufIndex;
02570         int lastInLumBuf= c->lastInLumBuf;
02571         int lastInChrBuf= c->lastInChrBuf;
02572         
02573         if(isPacked(c->srcFormat)){
02574                 src[0]=
02575                 src[1]=
02576                 src[2]= src[0];
02577                 srcStride[0]=
02578                 srcStride[1]=
02579                 srcStride[2]= srcStride[0];
02580         }
02581         srcStride[1]<<= c->vChrDrop;
02582         srcStride[2]<<= c->vChrDrop;
02583 
02584 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
02585 //              (int)dst[0], (int)dst[1], (int)dst[2]);
02586 
02587 #if 0 //self test FIXME move to a vfilter or something
02588 {
02589 static volatile int i=0;
02590 i++;
02591 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
02592         selfTest(src, srcStride, c->srcW, c->srcH);
02593 i--;
02594 }
02595 #endif
02596 
02597 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
02598 //dstStride[0],dstStride[1],dstStride[2]);
02599 
02600         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
02601         {
02602                 static int firstTime=1; //FIXME move this into the context perhaps
02603                 if(flags & SWS_PRINT_INFO && firstTime)
02604                 {
02605                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
02606                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
02607                         firstTime=0;
02608                 }
02609         }
02610 
02611         /* Note: the user might start scaling the picture in the middle, so this block will not get executed.
02612            This is not really intended, but it works currently, so people might rely on it. */
02613         if(srcSliceY ==0){
02614                 lumBufIndex=0;
02615                 chrBufIndex=0;
02616                 dstY=0; 
02617                 lastInLumBuf= -1;
02618                 lastInChrBuf= -1;
02619         }
02620 
02621         lastDstY= dstY;
02622 
02623         for(;dstY < dstH; dstY++){
02624                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
02625                 const int chrDstY= dstY>>c->chrDstVSubSample;
02626                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
02627                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
02628 
02629                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
02630                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
02631                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
02632                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
02633 
02634 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
02635 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
02636                 //handle holes (FAST_BILINEAR & weird filters)
02637                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
02638                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
02639 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
02640                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
02641                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
02642 
02643                 // Do we have enough lines in this slice to output the dstY line
02644                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
02645                 {
02646                         //Do horizontal scaling
02647                         while(lastInLumBuf < lastLumSrcY)
02648                         {
02649                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
02650                                 lumBufIndex++;
02651 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
02652                                 ASSERT(lumBufIndex < 2*vLumBufSize)
02653                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
02654                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
02655 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
02656                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
02657                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
02658                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
02659                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
02660                                 lastInLumBuf++;
02661                         }
02662                         while(lastInChrBuf < lastChrSrcY)
02663                         {
02664                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
02665                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
02666                                 chrBufIndex++;
02667                                 ASSERT(chrBufIndex < 2*vChrBufSize)
02668                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
02669                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
02670                                 //FIXME replace parameters through context struct (some at least)
02671 
02672                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
02673                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
02674                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
02675                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
02676                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
02677                                 lastInChrBuf++;
02678                         }
02679                         //wrap buf index around to stay inside the ring buffer
02680                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
02681                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
02682                 }
02683                 else // not enough lines left in this slice -> load the rest in the buffer
02684                 {
02685 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
02686                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
02687                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
02688                         vChrBufSize, vLumBufSize);*/
02689 
02690                         //Do horizontal scaling
02691                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
02692                         {
02693                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
02694                                 lumBufIndex++;
02695                                 ASSERT(lumBufIndex < 2*vLumBufSize)
02696                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
02697                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
02698                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
02699                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
02700                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
02701                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
02702                                 lastInLumBuf++;
02703                         }
02704                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
02705                         {
02706                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
02707                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
02708                                 chrBufIndex++;
02709                                 ASSERT(chrBufIndex < 2*vChrBufSize)
02710                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
02711                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
02712 
02713                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
02714                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
02715                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
02716                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
02717                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
02718                                 lastInChrBuf++;
02719                         }
02720                         //wrap buf index around to stay inside the ring buffer
02721                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
02722                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
02723                         break; //we can't output a dstY line so let's try with the next slice
02724                 }
02725 
02726 #ifdef HAVE_MMX
02727                 b5Dither= dither8[dstY&1];
02728                 g6Dither= dither4[dstY&1];
02729                 g5Dither= dither8[dstY&1];
02730                 r5Dither= dither8[(dstY+1)&1];
02731 #endif
02732             if(dstY < dstH-2)
02733             {
02734                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
02735                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
02736 #ifdef HAVE_MMX
02737                 int i;
02738                 for(i=0; i<vLumFilterSize; i++)
02739                 {
02740                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
02741                         lumMmxFilter[4*i+2]= 
02742                         lumMmxFilter[4*i+3]= 
02743                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
02744                 }
02745                 for(i=0; i<vChrFilterSize; i++)
02746                 {
02747                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
02748                         chrMmxFilter[4*i+2]= 
02749                         chrMmxFilter[4*i+3]= 
02750                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
02751                 }
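                /* layout written above: each filter tap uses four int32 slots --
                   [0] holds the pointer to the horizontally scaled source line,
                   [1] is left untouched, and [2]/[3] hold the 16-bit vertical
                   coefficient replicated into both halves via *0x10001, so one
                   quadword load gives the same coefficient in every 16-bit lane */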
02752 #endif
02753                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
02754                 {
02755                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
02756                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
02757                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
02758                         {
02759                                 int16_t *lumBuf = lumPixBuf[0];
02760                                 int16_t *chrBuf= chrPixBuf[0];
02761                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
02762                         }
02763                         else //General YV12
02764                         {
02765                                 RENAME(yuv2yuvX)(c,
02766                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
02767                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02768                                         dest, uDest, vDest, dstW, chrDstW);
02769                         }
02770                 }
02771                 else
02772                 {
02773                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
02774                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
02775                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
02776                         {
02777                                 int chrAlpha= vChrFilter[2*dstY+1];
02778                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
02779                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
02780                         }
02781                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
02782                         {
02783                                 int lumAlpha= vLumFilter[2*dstY+1];
02784                                 int chrAlpha= vChrFilter[2*dstY+1];
02785                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
02786                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
02787                         }
02788                         else //General RGB
02789                         {
02790                                 RENAME(yuv2packedX)(c,
02791                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02792                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02793                                         dest, dstW, dstY);
02794                         }
02795                 }
02796             }
02797             else // hmm looks like we can't use MMX here without overwriting this array's tail
02798             {
02799                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
02800                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
02801                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
02802                 {
02803                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
02804                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
02805                         yuv2yuvXinC(
02806                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
02807                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02808                                 dest, uDest, vDest, dstW, chrDstW);
02809                 }
02810                 else
02811                 {
02812                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
02813                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
02814                         yuv2packedXinC(c, 
02815                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02816                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02817                                 dest, dstW, dstY);
02818                 }
02819             }
02820         }
02821 
02822 #ifdef HAVE_MMX
02823         __asm __volatile(SFENCE:::"memory");
02824         __asm __volatile(EMMS:::"memory");
02825 #endif
02826         /* store changed local vars back in the context */
02827         c->dstY= dstY;
02828         c->lumBufIndex= lumBufIndex;
02829         c->chrBufIndex= chrBufIndex;
02830         c->lastInLumBuf= lastInLumBuf;
02831         c->lastInChrBuf= lastInChrBuf;
02832 
02833         return dstY - lastDstY;
02834 }
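
/* Editorial sketch, not part of the original file: one way a caller could
   drive the slice interface.  scale_by_slices and get_slice are hypothetical;
   sws_scale() is the public entry point, assumed here to dispatch to the
   swScale() above.  The value accumulated below is dstY - lastDstY, i.e. the
   number of destination lines each call managed to output. */
extern void get_slice(uint8_t *src[3], int srcStride[3], int sliceY, int sliceH); /* hypothetical decoder callback */

static void scale_by_slices(SwsContext *c, int srcH, int sliceH,
                            uint8_t *dst[], int dstStride[])
{
        int sliceY, outLines= 0;
        for(sliceY= 0; sliceY < srcH; sliceY+= sliceH)
        {
                uint8_t *src[3];
                int srcStride[3];
                const int h= MIN(sliceH, srcH - sliceY);
                get_slice(src, srcStride, sliceY, h);
                outLines+= sws_scale(c, src, srcStride, sliceY, h, dst, dstStride);
        }
        /* once the last slice has been fed in, outLines should have reached c->dstH */
        (void)outLines;
}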
