vlc-0.8.4a: /home/huihoo/src/vlc/vlc-0.8.4a/modules/video_filter/swscale/yuv2rgb

00001 /*
00002   [email protected]    March 8, 2004
00003 
00004   Altivec Acceleration for Color Space Conversion revision 0.2
00005 
00006   convert I420 YV12 to RGB in various formats,
00007     it rejects images that are not in 420 formats
00008     it rejects images that don't have widths of multiples of 16
00009     it rejects images that don't have heights of multiples of 2
00010   reject defers to C simulation codes.
00011 
00012   lots of optimizations to be done here
00013 
00014   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
00015      so we currently use max min to clip
00016 
00017   2. the inefficient use of chroma loading needs a bit of brushing up
00018 
00019   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
00020 
00021 
00022   MODIFIED to calculate coeffs from currently selected color space.
00023   MODIFIED core to be a macro which you spec the output format.
00024   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
00025   CORRECTED algorithm selection to be strict on input formats.
00026   ADDED runtime detection of altivec.
00027 
00028   ADDED altivec_yuv2packedX vertical scl + RGB converter
00029 
00030   March 27,2004
00031   PERFORMANCE ANALYSIS
00032 
00033   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
00034   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
00035 
00036   720*480*30  ~10MPS
00037 
00038   so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
00039 
00040   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
00041 
00042   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
00043   guaranteed to have the input video frame it was just decompressed so
00044   it probably resides in L1 caches.  However we are creating the
00045   output video stream this needs to use the DSTST instruction to
00046   optimize for the cache.  We couple this with the fact that we are
00047   not going to be visiting the input buffer again so we mark it Least
00048   Recently Used.  This shaves 25% of the processor cycles off.
00049 
00050   Now MEMCPY is the largest mips consumer in the system, probably due
00051   to the inefficient X11 stuff.
00052 
00053   GL libraries seem to be very slow on this machine 1.33Ghz PB running
00054   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
00055   a versioning issue; however, I have libGL.1.2.dylib for both
00056   machines. ((We need to figure this out now))
00057 
00058   GL2 libraries work now with patch for RGB32
00059 
00060   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
00061 
00062   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
00063 
00064 */
00065 #include <stdio.h>
00066 #include <stdlib.h>
00067 #include <string.h>
00068 #include <inttypes.h>
00069 #include <assert.h>
00070 #include "config.h"
00071 #include "rgb2rgb.h"
00072 #include "swscale.h"
00073 #include "swscale_internal.h"
00074 #include "mangle.h"
00075 #include "img_format.h" //FIXME try to reduce dependency of such stuff
00076 
/* Force both build-time switches off for this file.  Neither name is
   referenced in the part of the file visible in this listing. */
00077 #undef PROFILE_THE_BEAST
00078 #undef INC_SCALING
00079 
/* Short aliases for raw 8-bit samples (unsigned and signed). */
00080 typedef unsigned char ubyte;
00081 typedef signed char   sbyte;
00082 
00083 
00084 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
00085    homogeneous vector registers x0,x1,x2 are interleaved with the
00086    following technique:
00087 
00088       o0 = vec_mergeh (x0,x1);
00089       o1 = vec_perm (o0, x2, perm_rgb_0);
00090       o2 = vec_perm (o0, x2, perm_rgb_1);
00091       o3 = vec_mergel (x0,x1);
00092       o4 = vec_perm (o3,o2,perm_rgb_2);
00093       o5 = vec_perm (o3,o2,perm_rgb_3);
00094 
00095   perm_rgb_0:   o0(RG).h v1(B) --> o1*
00096               0   1  2   3   4
00097              rgbr|gbrg|brgb|rgbr
00098              0010 0100 1001 0010
00099              0102 3145 2673 894A
00100 
00101   perm_rgb_1:   o0(RG).h v1(B) --> o2
00102               0   1  2   3   4
00103              gbrg|brgb|bbbb|bbbb
00104              0100 1001 1111 1111
00105              B5CD 6EF7 89AB CDEF
00106 
00107   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
00108               0   1  2   3   4
00109              gbrg|brgb|rgbr|gbrg
00110              1111 1111 0010 0100
00111              89AB CDEF 0182 3945
00112 
00113   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
00114               0   1  2   3   4
00115              brgb|rgbr|gbrg|brgb
00116              1001 0010 0100 1001
00117              a67b 89cA BdCD eEFf
00118 
00119 */
/* vec_perm control vectors used by vec_merge3 to interleave three planar
   byte vectors into three vectors of packed 3-byte pixels.
   Selector values 0x00-0x0f pick a byte from vec_perm's first operand,
   0x10-0x1f pick a byte from its second operand.
     perm_rgb_0: first operand = merged pairs (high half), second = 3rd plane
     perm_rgb_1: same operands; produces the spill-over bytes plus bytes
                 8..15 of the 3rd plane for the later steps
     perm_rgb_2 / perm_rgb_3: first operand = merged pairs (low half),
                 second = the perm_rgb_1 result */
00120 static
00121 const vector unsigned char
00122   perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
00123                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
00124   perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
00125                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
00126   perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
00127                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
00128   perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
00129                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
00130 
/* Interleave three 16-byte planar vectors into 48 bytes of packed 3-byte
   pixels, returned in y0,y1,y2 (first, second and third 16 bytes).

   Note the parameter list is written in reverse: the formal named x0 is the
   THIRD macro argument.  The byte order produced is x0[i],x1[i],x2[i] for
   i = 0..15, i.e. third argument first, first argument last.

   y0,y1,y2 must be assignable lvalues; the temporaries take the type of the
   third argument (typeof(x0)).  The plane arguments are expanded more than
   once, so they should be free of side effects. */
00131 #define vec_merge3(x2,x1,x0,y0,y1,y2)    \
00132 do {                                     \
00133   typeof(x0) o0,o2,o3;                   \
00134       o0 = vec_mergeh (x0,x1);           \
00135       y0 = vec_perm (o0, x2, perm_rgb_0);\
00136       o2 = vec_perm (o0, x2, perm_rgb_1);\
00137       o3 = vec_mergel (x0,x1);           \
00138       y1 = vec_perm (o3,o2,perm_rgb_2);  \
00139       y2 = vec_perm (o3,o2,perm_rgb_3);  \
00140 } while(0)
00141 
/* Interleave three planar byte vectors into 16 packed 3-byte pixels and
   store them (48 bytes) through ptr with three vec_st stores; ptr is
   post-incremented three times, so it must be a modifiable pointer to a
   16-byte vector type.

   The arguments are forwarded unchanged to vec_merge3, which emits its third
   argument first: the bytes land in memory in the order x2,x1,x0.

   The expansion deliberately has no trailing semicolon so that
   "vec_mstrgb24 (...);" is exactly one statement (safe in if/else). */
00142 #define vec_mstrgb24(x0,x1,x2,ptr)        \
00143 do {                                     \
00144   typeof(x0) _0,_1,_2;                   \
00145   vec_merge3 (x0,x1,x2,_0,_1,_2);        \
00146   vec_st (_0, 0, ptr++);                 \
00147   vec_st (_1, 0, ptr++);                 \
00148   vec_st (_2, 0, ptr++);                 \
00149 }  while (0)
00150 
/* Same as the 24-bit store above but with the planes handed to vec_merge3
   in reverse (x2,x1,x0).  Since vec_merge3 emits its third argument first,
   the bytes land in memory in the order x0,x1,x2.  Stores 48 bytes through
   ptr and post-increments ptr three times.

   The expansion deliberately has no trailing semicolon so that
   "vec_mstbgr24 (...);" is exactly one statement (safe in if/else). */
00151 #define vec_mstbgr24(x0,x1,x2,ptr)       \
00152 do {                                     \
00153   typeof(x0) _0,_1,_2;                   \
00154   vec_merge3 (x2,x1,x0,_0,_1,_2);        \
00155   vec_st (_0, 0, ptr++);                 \
00156   vec_st (_1, 0, ptr++);                 \
00157   vec_st (_2, 0, ptr++);                 \
00158 }  while (0)
00159 
00160 /* pack the pixels in rgb0 format
00161    msb R
00162    lsb 0
00163 */
/* Interleave four planar byte vectors into 16 packed 4-byte pixels and store
   them as four consecutive 16-byte vectors starting at ptr.
     T         vector type used for the temporaries and the stores
     x0..x3    planar inputs; each pixel is stored as bytes x0[i],x1[i],x2[i],x3[i]
     ptr       destination; advanced by 4 elements afterwards (64 bytes when
               it points to 16-byte vectors, as in the callers in this file)
   Byte pairs (x0,x1) and (x2,x3) are merged first, then merged again as
   16-bit units to form whole pixels; the high halves give pixels 0..7 and the
   low halves pixels 8..15.

   The expansion deliberately has no trailing semicolon so that
   "vec_mstrgb32 (...);" is exactly one statement (safe in if/else). */
00164 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
00165 do {                                                                                   \
00166   T _0,_1,_2,_3;                                                                       \
00167   _0 = vec_mergeh (x0,x1);                                                             \
00168   _1 = vec_mergeh (x2,x3);                                                             \
00169   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
00170   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
00171   vec_st (_2, 0*16, (T *)ptr);                                                         \
00172   vec_st (_3, 1*16, (T *)ptr);                                                         \
00173   _0 = vec_mergel (x0,x1);                                                             \
00174   _1 = vec_mergel (x2,x3);                                                             \
00175   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
00176   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
00177   vec_st (_2, 2*16, (T *)ptr);                                                         \
00178   vec_st (_3, 3*16, (T *)ptr);                                                         \
00179   ptr += 4;                                                                            \
00180 }  while (0)
00181 
00182 /*
00183 
00184   | 1     0       1.4021   | | Y |
00185   | 1    -0.3441 -0.7142   |x| Cb|
00186   | 1     1.7718  0        | | Cr|
00187 
00188 
00189   Y:      [-128 127]
00190   Cb/Cr : [-128 127]
00191 
00192   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
00193 
00194 */
00195 
00196 
00197 
00198 
/* Zero-extend bytes 0..7 of x into eight 16-bit lanes.
   vec_perm pairs every source byte with selector 0x10, which picks byte 0 of
   the second operand (an all-zero vector), so each halfword is 0x00:x[i].
   The result is cast to vector signed short. */
00199 #define vec_unh(x) \
00200   (vector signed short) \
00201     vec_perm(x,(typeof(x))(0),\
00202              (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
00203                                     0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
/* Zero-extend bytes 8..15 of x into eight 16-bit lanes (the low-half
   counterpart of vec_unh): selector 0x10 supplies a zero byte from the
   all-zero second operand, 0x08..0x0F select the source bytes.
   The result is cast to vector signed short. */
00204 #define vec_unl(x) \
00205   (vector signed short) \
00206     vec_perm(x,(typeof(x))(0),\
00207              (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
00208                                     0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
00209 
/* Clamp every element of x to the range [0,255]: min against 255 first,
   then max against 0.  x is expanded three times (value and two typeof). */
00210 #define vec_clip(x) \
00211   vec_max (vec_min (x, (typeof(x))(255)), (typeof(x))(0))
00212 
/* Clamp x and y to [0,255] with vec_clip, then narrow both with vec_pack
   into one vector of 16 unsigned bytes (x supplies the first eight).
   Two-step (min+max) variant; not used in the part of the file visible in
   this listing. */
00213 #define vec_packclp_a(x,y) \
00214   (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
00215 
/* Clamp-and-pack two vectors of signed 16-bit values into 16 unsigned bytes.
   Negative values are raised to 0 with vec_max; the result is reinterpreted
   as unsigned short and narrowed with the saturating vec_packs, which caps
   anything above 255.  No vec_min is needed. */
00216 #define vec_packclp(x,y) \
00217   (vector unsigned char)vec_packs \
00218       ((vector unsigned short)vec_max (x,(vector signed short) (0)), \
00219        (vector unsigned short)vec_max (y,(vector signed short) (0)))
00220 
00221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
00222 
00223 
/* Convert eight pixels from Y/U/V to R/G/B, all as signed 16-bit lanes.

   c        supplies the coefficient vectors CY, OY, CBU, CRV, CGU, CGV and
            the shift count CSHIFT
   Y,U,V    one sample per lane; U and V are expected with a +128 bias, which
            is removed here
   R,G,B    outputs; NOT clipped to 0..255 here (the callers in this file
            pack them with vec_packclp)

   vec_mradds(a,b,c) computes ((a*b + 0x4000) >> 15) + c with saturation. */
00224 static inline void cvtyuvtoRGB (SwsContext *c,
00225                            vector signed short Y, vector signed short U, vector signed short V,
00226                            vector signed short *R, vector signed short *G, vector signed short *B)
00227 {
00228   vector signed   short vx,ux,uvx;
00229 
  /* scale luma by CY and add the offset OY */
00230   Y = vec_mradds (Y, c->CY, c->OY);
00231 
  /* remove the chroma bias so U and V are centred on zero */
00232   U = vec_sub (U,(vector signed short)(128));
00233   V = vec_sub (V,(vector signed short)(128));
00234 
00235   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
00236   ux = vec_sl (U, c->CSHIFT);
00237   *B = vec_mradds (ux, c->CBU, Y);
00238 
00239   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
00240   vx = vec_sl (V, c->CSHIFT);
00241   *R = vec_mradds (vx, c->CRV, Y);
00242 
00243   // uvx = ((CGU*u) + (CGV*v))>>15;
  /* green accumulates both chroma terms on top of luma, U term first */
00244   uvx = vec_mradds (U, c->CGU, Y);
00245   *G = vec_mradds (V, c->CGV, uvx);
00246 }
00247 
00248 
00249 /*
00250   ------------------------------------------------------------------------------
00251   CS converters
00252   ------------------------------------------------------------------------------
00253 */
00254 
00255 
/*
  DEFCSP420_CVT(name, out_pixels) defines

      static int altivec_<name> (SwsContext *c, unsigned char **in, int *instrides,
                                 int srcSliceY, int srcSliceH,
                                 unsigned char **oplanes, int *outstrides)

  a planar 4:2:0 YUV to packed RGB converter.

    out_pixels(R,G,B,ptr)  store macro: writes 16 pixels from three vectors of
                           16 unsigned bytes and advances ptr
    in[0], in[1], in[2]    luma plane and the two chroma planes (read as U, V)
    instrides[]            byte strides of the three input planes
    srcSliceY, srcSliceH   first output row and number of rows in the slice
    oplanes[0]             packed output; outstrides[0] is its byte stride
    returns                srcSliceH

  Each inner iteration converts 16 pixels from two vertically adjacent luma
  rows that share 8 U and 8 V samples.  Luma rows are stepped with
  instrides[0], so padded luma planes (stride > width) are handled.

  Assumptions that remain:
    - only w/16 blocks per row and h/2 row pairs are converted; any remainder
      is left untouched
    - luma is read with vec_ldl and output is written with vec_st, both of
      which ignore the low four address bits (16-byte alignment expected);
      chroma is read unaligned via vec_lvsl/vec_perm, which also touches the
      following 16-byte block
    - NOTE(review): the output pointers are advanced by the stores and then by
      outstrides[0]>>4 vectors, which lands two rows further only when
      outstrides[0] equals the bytes written per row - confirm with callers
    - the vec_dstst cache hints are sized from w*3 for every output format
*/
00256 #define DEFCSP420_CVT(name,out_pixels)                                     \
00257 static int altivec_##name (SwsContext *c,                                  \
00258                                 unsigned char **in, int *instrides,        \
00259                                 int srcSliceY,  int srcSliceH,             \
00260                                 unsigned char **oplanes, int *outstrides)  \
00261 {                                                                          \
00262   int w = c->srcW;                                                         \
00263   int h = srcSliceH;                                                       \
00264   int i,j;                                                                 \
00265   int instrides_scl[3];                                                    \
00266   vector unsigned char y0,y1;                                              \
00267                                                                            \
00268   vector signed char  u,v;                                                 \
00269                                                                            \
00270   vector signed short Y0,Y1,Y2,Y3;                                         \
00271   vector signed short U,V;                                                 \
00272   vector signed short vx,ux,uvx;                                           \
00273   vector signed short vx0,ux0,uvx0;                                        \
00274   vector signed short vx1,ux1,uvx1;                                        \
00275   vector signed short R0,G0,B0;                                            \
00276   vector signed short R1,G1,B1;                                            \
00277   vector unsigned char R,G,B;                                              \
00278                                                                            \
00279   vector unsigned char *uivP, *vivP;                                       \
00280   vector unsigned char align_perm;                                         \
00281                                                                            \
00282   vector signed short                                                      \
00283     lCY  = c->CY,                                                          \
00284     lOY  = c->OY,                                                          \
00285     lCRV = c->CRV,                                                         \
00286     lCBU = c->CBU,                                                         \
00287     lCGU = c->CGU,                                                         \
00288     lCGV = c->CGV;                                                         \
00289                                                                            \
00290   vector unsigned short lCSHIFT = c->CSHIFT;                               \
00291                                                                            \
00292   ubyte *y1i   = in[0];                                                    \
00293   ubyte *y2i   = in[0]+instrides[0];  /* second luma row of the pair */    \
00294   ubyte *ui    = in[1];                                                    \
00295   ubyte *vi    = in[2];                                                    \
00296                                                                            \
00297   vector unsigned char *oute                                               \
00298     = (vector unsigned char *)                                             \
00299         (oplanes[0]+srcSliceY*outstrides[0]);                              \
00300   vector unsigned char *outo                                               \
00301     = (vector unsigned char *)                                             \
00302         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                \
00303                                                                            \
00304                                                                            \
00305   instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */  \
00306   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
00307   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
00308                                                                            \
00309   /* one pass converts two luma rows that share one chroma row */          \
00310   for (i=0;i<h/2;i++) {                                                    \
00311     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
00312     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
00313                                                                            \
00314     for (j=0;j<w/16;j++) {                                                 \
00315                                                                            \
00316       y0 = vec_ldl (0,y1i);                                                \
00317       y1 = vec_ldl (0,y2i);                                                \
00318       uivP = (vector unsigned char *)ui;                                   \
00319       vivP = (vector unsigned char *)vi;                                   \
00320                                                                            \
00321       align_perm = vec_lvsl (0, ui);                                       \
00322       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
00323                                                                            \
00324       align_perm = vec_lvsl (0, vi);                                       \
00325       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
00326                                                                            \
00327       u  = (vector signed char)vec_sub (u, (vector signed char)(128));     \
00328       v  = (vector signed char)vec_sub (v, (vector signed char)(128));     \
00329       U  = vec_unpackh (u);                                                \
00330       V  = vec_unpackh (v);                                                \
00331                                                                            \
00332                                                                            \
00333         Y0 = vec_unh (y0);                                                 \
00334         Y1 = vec_unl (y0);                                                 \
00335         Y2 = vec_unh (y1);                                                 \
00336         Y3 = vec_unl (y1);                                                 \
00337                                                                            \
00338         Y0 = vec_mradds (Y0, lCY, lOY);                                    \
00339         Y1 = vec_mradds (Y1, lCY, lOY);                                    \
00340         Y2 = vec_mradds (Y2, lCY, lOY);                                    \
00341         Y3 = vec_mradds (Y3, lCY, lOY);                                    \
00342                                                                            \
00343         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
00344         ux = vec_sl (U, lCSHIFT);                                          \
00345         ux = vec_mradds (ux, lCBU, (vector signed short)(0));              \
00346         ux0  = vec_mergeh (ux,ux);                                         \
00347         ux1  = vec_mergel (ux,ux);                                         \
00348                                                                            \
00349         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
00350         vx = vec_sl (V, lCSHIFT);                                          \
00351         vx = vec_mradds (vx, lCRV, (vector signed short)(0));              \
00352         vx0  = vec_mergeh (vx,vx);                                         \
00353         vx1  = vec_mergel (vx,vx);                                         \
00354                                                                            \
00355         /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
00356         uvx = vec_mradds (U, lCGU, (vector signed short)(0));              \
00357         uvx = vec_mradds (V, lCGV, uvx);                                   \
00358         uvx0 = vec_mergeh (uvx,uvx);                                       \
00359         uvx1 = vec_mergel (uvx,uvx);                                       \
00360                                                                            \
00361         R0 = vec_add (Y0,vx0);                                             \
00362         G0 = vec_add (Y0,uvx0);                                            \
00363         B0 = vec_add (Y0,ux0);                                             \
00364         R1 = vec_add (Y1,vx1);                                             \
00365         G1 = vec_add (Y1,uvx1);                                            \
00366         B1 = vec_add (Y1,ux1);                                             \
00367                                                                            \
00368         R  = vec_packclp (R0,R1);                                          \
00369         G  = vec_packclp (G0,G1);                                          \
00370         B  = vec_packclp (B0,B1);                                          \
00371                                                                            \
00372         out_pixels(R,G,B,oute);                                            \
00373                                                                            \
00374         R0 = vec_add (Y2,vx0);                                             \
00375         G0 = vec_add (Y2,uvx0);                                            \
00376         B0 = vec_add (Y2,ux0);                                             \
00377         R1 = vec_add (Y3,vx1);                                             \
00378         G1 = vec_add (Y3,uvx1);                                            \
00379         B1 = vec_add (Y3,ux1);                                             \
00380         R  = vec_packclp (R0,R1);                                          \
00381         G  = vec_packclp (G0,G1);                                          \
00382         B  = vec_packclp (B0,B1);                                          \
00383                                                                            \
00384                                                                            \
00385         out_pixels(R,G,B,outo);                                            \
00386                                                                            \
00387       y1i  += 16;                                                          \
00388       y2i  += 16;                                                          \
00389       ui   += 8;                                                           \
00390       vi   += 8;                                                           \
00391                                                                            \
00392     }                                                                      \
00393                                                                            \
00394     outo += (outstrides[0])>>4;                                            \
00395     oute += (outstrides[0])>>4;                                            \
00396                                                                            \
00397     ui    += instrides_scl[1];                                             \
00398     vi    += instrides_scl[2];                                             \
00399     y1i   += instrides_scl[0];                                             \
00400     y2i   += instrides_scl[0];                                             \
00401   }                                                                        \
00402   return srcSliceH;                                                        \
00403 }
00404 
00405 
/* Store macros usable as the out_pixels argument of DEFCSP420_CVT.
   Each takes three vectors of 16 bytes (called as (R,G,B,ptr) in this file),
   writes 16 pixels and advances ptr.

   32-bit variants forward to vec_mstrgb32, which stores its four planes in
   argument order, with an all-zero vector as the fourth channel:
     out_abgr -> bytes 0,c,b,a      out_bgra -> bytes c,b,a,0
     out_rgba -> bytes a,b,c,0      out_argb -> bytes 0,a,b,c

   24-bit variants forward to vec_mstrgb24, which stores its planes in
   reverse argument order:
     out_rgb24 -> bytes c,b,a       out_bgr24 -> bytes a,b,c
   NOTE(review): for a call with (R,G,B) that makes out_rgb24 write B,G,R and
   out_bgr24 write R,G,B in memory - confirm this matches the intended pixel
   format definitions. */
00406 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
00407 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
00408 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
00409 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
00410 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
00411 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
00412 
/* Instantiate one 4:2:0 converter per output layout; each line defines a
   static function named altivec_<first argument>.
   NOTE(review): yuv2_bgra32 is built with out_argb, not out_bgra, so it is
   byte-for-byte the same converter as yuv2_argb32, and out_bgra is unused in
   the part of the file visible here.  This may be deliberate for the
   native-endian 32-bit formats on big-endian hosts - confirm before changing. */
00413 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
00414 DEFCSP420_CVT (yuv2_bgra32, out_argb)
00415 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
00416 DEFCSP420_CVT (yuv2_argb32, out_argb)
00417 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
00418 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
00419 
00420 
00421 // uyvy|uyvy|uyvy|uyvy
00422 // 0123 4567 89ab cdef
/* vec_perm control vectors that split one 16-byte group of packed UYVY
   (8 pixels) into 16-bit lanes.  Selector 0x10 picks a byte from the second
   vec_perm operand, which the caller passes as all zeros, so every lane is a
   zero-extended source byte.
     demux_u: bytes 0,4,8,12, each used for two consecutive lanes
     demux_v: bytes 2,6,10,14, each used for two consecutive lanes
     demux_y: bytes 1,3,5,...,15, one per lane */
00423 static
00424 const vector unsigned char
00425   demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
00426                                    0x10,0x04,0x10,0x04,
00427                                    0x10,0x08,0x10,0x08,
00428                                    0x10,0x0c,0x10,0x0c),
00429   demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
00430                                    0x10,0x06,0x10,0x06,
00431                                    0x10,0x0A,0x10,0x0A,
00432                                    0x10,0x0E,0x10,0x0E),
00433   demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
00434                                    0x10,0x05,0x10,0x07,
00435                                    0x10,0x09,0x10,0x0B,
00436                                    0x10,0x0D,0x10,0x0F);
00437 
00438 /*
00439   this is so I can play live CCIR raw video
00440 */
/* Convert a slice of packed UYVY 4:2:2 to 32-bit pixels stored as R,G,B,0
   (out_rgba).

     c                     conversion context: source width and the
                           coefficient vectors used by cvtyuvtoRGB
     in[0], instrides[0]   packed UYVY input and its byte stride
     srcSliceY, srcSliceH  first output row and number of rows to convert
     oplanes[0],
     outstrides[0]         packed output and its byte stride
     returns               srcSliceH

   Every row starts at the address given by its stride, so buffers with row
   padding are handled.  Each inner iteration reads 32 input bytes
   (16 pixels) and writes 64 output bytes; only w/16 such blocks are
   converted per row.  vec_ld and vec_st ignore the low four address bits, so
   rows are expected to be 16-byte aligned. */
00441 static int altivec_uyvy_rgb32 (SwsContext *c,
00442                                unsigned char **in, int *instrides,
00443                                int srcSliceY,   int srcSliceH,
00444                                unsigned char **oplanes, int *outstrides)
00445 {
00446   int w = c->srcW;
00447   int h = srcSliceH;
00448   int i,j;
00449   vector unsigned char uyvy;
00450   vector signed   short Y,U,V;
00452   vector signed   short R0,G0,B0,R1,G1,B1;
00453   vector unsigned char  R,G,B;
00454   vector unsigned char *out;
00455   ubyte *img;
00456 
00460   for (i=0;i<h;i++) {
    /* address each row through the strides instead of assuming the rows are
       tightly packed */
    img = in[0]+i*instrides[0];
    out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);

00461     for (j=0;j<w/16;j++) {
      /* first 8 pixels */
00462       uyvy = vec_ld (0, img);
00463       U = (vector signed short)
00464         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
00465 
00466       V = (vector signed short)
00467         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
00468 
00469       Y = (vector signed short)
00470         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
00471 
00472       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
00473 
      /* next 8 pixels */
00474       uyvy = vec_ld (16, img);
00475       U = (vector signed short)
00476         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
00477 
00478       V = (vector signed short)
00479         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
00480 
00481       Y = (vector signed short)
00482         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
00483 
00484       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
00485 
      /* clamp to 0..255 and narrow to bytes, 16 pixels per channel */
00486       R  = vec_packclp (R0,R1);
00487       G  = vec_packclp (G0,G1);
00488       B  = vec_packclp (B0,B1);
00489 
00490       //      vec_mstbgr24 (R,G,B, out);
00491       out_rgba (R,G,B,out);
00492 
00493       img += 32;
00494     }
00495   }
00496   return srcSliceH;
00497 }
00498 
00499 
00500 
00501 /* Ok currently the acceleration routine only supports
00502    inputs of widths a multiple of 16
00503    and heights a multiple of 2
00504 
00505    So we just fall back to the C codes for this.
00506 */
/* Pick an AltiVec conversion routine for the formats configured in c.

   Returns NULL - leaving the choice to the caller - when
     - c->flags lacks SWS_CPU_CAPS_ALTIVEC,
     - the source width is not a multiple of 16,
     - a planar source has an odd height, or
     - the source/destination pair has no routine below.

   NOTE(review): the planar case list is wider than the layout the converter
   reads (three separate planes, chroma at half width and half height);
   whether the other listed formats can actually reach this point depends on
   the caller - confirm.
   altivec_yuv2_abgr32 and altivec_yuv2_rgba32 are defined in this file but
   are not selected here. */
00507 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
00508 {
00509   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
00510     return NULL;
00511 
00512   /*
00513     and this seems not to matter too much I tried a bunch of 
00514     videos with abnormal widths and mplayer crashes else where.
00515     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
00516     boom with X11 bad match.
00517     
00518   */
00519   if ((c->srcW & 0xf) != 0)    return NULL;
00520 
00521   switch (c->srcFormat) {
00522   case IMGFMT_YVU9:
00523   case IMGFMT_IF09:
00524   case IMGFMT_YV12:
00525   case IMGFMT_I420:
00526   case IMGFMT_IYUV:
00527   case IMGFMT_CLPL:
00528   case IMGFMT_Y800:
00529   case IMGFMT_Y8:
00530   case IMGFMT_NV12:
00531   case IMGFMT_NV21:
    /* the planar converters work on pairs of rows */
00532     if ((c->srcH & 0x1) != 0)
00533       return NULL;
00534 
00535     switch(c->dstFormat){
00536     case IMGFMT_RGB24:
00537       MSG_WARN("ALTIVEC: Color Space RGB24\n");
00538       return altivec_yuv2_rgb24;
00539     case IMGFMT_BGR24:
00540       MSG_WARN("ALTIVEC: Color Space BGR24\n");
00541       return altivec_yuv2_bgr24;
00542     case IMGFMT_RGB32:
00543       MSG_WARN("ALTIVEC: Color Space ARGB32\n");
00544       return altivec_yuv2_argb32;
00545     case IMGFMT_BGR32:
00546       MSG_WARN("ALTIVEC: Color Space BGRA32\n");
00547       //      return profile_altivec_bgra32;
00548 
00549       return altivec_yuv2_bgra32;
00550     default: return NULL;
00551     }
    /* not reached: every case of the inner switch returns */
00552     break;
00553 
00554   case IMGFMT_UYVY:
00555     switch(c->dstFormat){
00556     case IMGFMT_RGB32:
00557       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
00558       return altivec_uyvy_rgb32;
00559     default: return NULL;
00560     }
    /* not reached: every case of the inner switch returns */
00561     break;
00562 
00563   }
  /* any other source format: no AltiVec routine */
00564   return NULL;
00565 }
00566 
/* Compute the coefficient vectors used by the AltiVec converters and store
   them in c (CSHIFT, CY, OY, CRV, CBU, CGU, CGV).

     c          context; reads flags, contrast, saturation and brightness
     inv_table  four colour-space coefficients, used in the order
                crv, cbu, cgu, cgv

   Does nothing when c->flags lacks SWS_CPU_CAPS_ALTIVEC.

   Every scalar is written to the 16-byte aligned short `tmp`, loaded into
   element 0 with vec_lde and replicated to all lanes with vec_splat.
   NOTE(review): the >>17 and >>32 scalings suggest contrast and saturation
   are 16.16 fixed point - confirm against SwsContext.
   The local Y0 is declared but never used. */
00567 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
00568 {
00569   vector signed short CY, CRV, CBU, CGU, CGV, OY, Y0;
00570   int64_t crv __attribute__ ((aligned(16))) = inv_table[0];
00571   int64_t cbu __attribute__ ((aligned(16))) = inv_table[1];
00572   int64_t cgu __attribute__ ((aligned(16))) = inv_table[2];
00573   int64_t cgv __attribute__ ((aligned(16))) = inv_table[3];
00574   int64_t cy = (1<<16)-1;
00575   int64_t oy = 0;
00576   short tmp __attribute__ ((aligned(16)));
00577 
00578   if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
00579     return;
00580 
  /* fold contrast into luma, and contrast*saturation into the chroma terms */
00581   cy = (cy *c->contrast             )>>17;
00582   crv= (crv*c->contrast * c->saturation)>>32;
00583   cbu= (cbu*c->contrast * c->saturation)>>32;
00584   cgu= (cgu*c->contrast * c->saturation)>>32;
00585   cgv= (cgv*c->contrast * c->saturation)>>32;
00586 
  /* luma offset: brightness scaled by 256 and negated */
00587   oy -= 256*c->brightness;
00588 
00589   tmp = cy;
00590   CY = vec_lde (0, &tmp);
00591   CY  = vec_splat (CY, 0);
00592 
00593   tmp = oy;
00594   OY = vec_lde (0, &tmp);
00595   OY  = vec_splat (OY, 0);
00596 
  /* CRV and CBU are stored divided by 8; the converters shift the chroma
     left by CSHIFT (2) before vec_mradds, whose >>15 then yields an overall
     (coefficient * chroma) >> 16 */
00597   tmp = crv>>3;
00598   CRV = vec_lde (0, &tmp);
00599   CRV  = vec_splat (CRV, 0);
00600   tmp = cbu>>3;
00601   CBU = vec_lde (0, &tmp);
00602   CBU  = vec_splat (CBU, 0);
00603 
  /* the green terms are negated and halved; they are applied to unshifted
     chroma, which again gives (coefficient * chroma) >> 16 overall */
00604   tmp = -(cgu>>1);
00605   CGU = vec_lde (0, &tmp);
00606   CGU  = vec_splat (CGU, 0);
00607   tmp = -(cgv>>1);
00608   CGV = vec_lde (0, &tmp);
00609   CGV  = vec_splat (CGV, 0);
00610 
00611   c->CSHIFT = (vector unsigned short)(2);
00612   c->CY = CY;
00613   c->OY = OY;
00614   c->CRV = CRV;
00615   c->CBU = CBU;
00616   c->CGU = CGU;
00617   c->CGV = CGV;
00618 
00619 #if 0
00620   printf ("cy:  %hvx\n", CY);
00621   printf ("oy:  %hvx\n", OY);
00622   printf ("crv: %hvx\n", CRV);
00623   printf ("cbu: %hvx\n", CBU);
00624   printf ("cgv: %hvx\n", CGV);
00625   printf ("cgu: %hvx\n", CGU);
00626 #endif
00627 
00628  return;
00629 }
00630 
00631 
00632 void
00633 altivec_yuv2packedX (SwsContext *c,
00634                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00635                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00636                        uint8_t *dest, int dstW, int dstY)
00637 {
00638   int i,j;
00639   short tmp __attribute__((aligned (16)));
00640   short *p;
00641   short *f;
00642   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
00643   vector signed short R0,G0,B0,R1,G1,B1;
00644 
00645   vector unsigned char R,G,B,pels[3];
00646   vector unsigned char *out,*nout;
00647   vector signed short   RND = (vector signed short)(1<<3);
00648   vector unsigned short SCL = (vector unsigned short)(4);
00649   unsigned long scratch[16] __attribute__ ((aligned (16)));
00650 
00651   vector signed short *vYCoeffsBank, *vCCoeffsBank;
00652 
00653   vector signed short *YCoeffs, *CCoeffs;
00654 
00655   vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
00656   vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
00657 
00658   for (i=0;i<lumFilterSize*dstW;i++) {
00659     tmp = c->vLumFilter[i];
00660     p = &vYCoeffsBank[i];
00661     for (j=0;j<8;j++)
00662       p[j] = tmp;
00663   }
00664 
00665   for (i=0;i<chrFilterSize*dstW;i++) {
00666     tmp = c->vChrFilter[i];
00667     p = &vCCoeffsBank[i];
00668     for (j=0;j<8;j++)
00669       p[j] = tmp;
00670   }
00671 
00672   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
00673   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
00674 
00675   out = (vector unsigned char *)dest;
00676 
00677   for(i=0; i<dstW; i+=16){
00678     Y0 = RND;
00679     Y1 = RND;
00680     /* extract 16 coeffs from lumSrc */
00681     for(j=0; j<lumFilterSize; j++) {
00682       X0 = vec_ld (0,  &lumSrc[j][i]);
00683       X1 = vec_ld (16, &lumSrc[j][i]);
00684       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00685       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00686     }
00687 
00688     U = RND;
00689     V = RND;
00690     /* extract 8 coeffs from U,V */
00691     for(j=0; j<chrFilterSize; j++) {
00692       X  = vec_ld (0, &chrSrc[j][i/2]);
00693       U  = vec_mradds (X, CCoeffs[j], U);
00694       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
00695       V  = vec_mradds (X, CCoeffs[j], V);
00696     }
00697 
00698     /* scale and clip signals */
00699     Y0 = vec_sra (Y0, SCL);
00700     Y1 = vec_sra (Y1, SCL);
00701     U  = vec_sra (U,  SCL);
00702     V  = vec_sra (V,  SCL);
00703 
00704     Y0 = vec_clip (Y0);
00705     Y1 = vec_clip (Y1);
00706     U  = vec_clip (U);
00707     V  = vec_clip (V);
00708 
00709     /* now we have
00710       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00711       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
00712 
00713       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00714       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00715       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00716     */
00717 
00718     U0 = vec_mergeh (U,U);
00719     V0 = vec_mergeh (V,V);
00720 
00721     U1 = vec_mergel (U,U);
00722     V1 = vec_mergel (V,V);
00723 
00724     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00725     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00726 
00727     R  = vec_packclp (R0,R1);
00728     G  = vec_packclp (G0,G1);
00729     B  = vec_packclp (B0,B1);
00730 
00731     out_rgba (R,G,B,out);
00732   }
00733 
00734   if (i < dstW) {
00735     i -= 16;
00736 
00737     Y0 = RND;
00738     Y1 = RND;
00739     /* extract 16 coeffs from lumSrc */
00740     for(j=0; j<lumFilterSize; j++) {
00741       X0 = vec_ld (0,  &lumSrc[j][i]);
00742       X1 = vec_ld (16, &lumSrc[j][i]);
00743       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00744       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00745     }
00746 
00747     U = RND;
00748     V = RND;
00749     /* extract 8 coeffs from U,V */
00750     for(j=0; j<chrFilterSize; j++) {
00751       X  = vec_ld (0, &chrSrc[j][i/2]);
00752       U  = vec_mradds (X, CCoeffs[j], U);
00753       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
00754       V  = vec_mradds (X, CCoeffs[j], V);
00755     }
00756 
00757     /* scale and clip signals */
00758     Y0 = vec_sra (Y0, SCL);
00759     Y1 = vec_sra (Y1, SCL);
00760     U  = vec_sra (U,  SCL);
00761     V  = vec_sra (V,  SCL);
00762 
00763     Y0 = vec_clip (Y0);
00764     Y1 = vec_clip (Y1);
00765     U  = vec_clip (U);
00766     V  = vec_clip (V);
00767 
00768     /* now we have
00769        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00770        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
00771 
00772        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00773        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00774        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00775     */
00776 
00777     U0 = vec_mergeh (U,U);
00778     V0 = vec_mergeh (V,V);
00779 
00780     U1 = vec_mergel (U,U);
00781     V1 = vec_mergel (V,V);
00782 
00783     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00784     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00785 
00786     R  = vec_packclp (R0,R1);
00787     G  = vec_packclp (G0,G1);
00788     B  = vec_packclp (B0,B1);
00789 
00790     nout = (vector unsigned char *)scratch;
00791     out_rgba (R,G,B,nout);
00792 
00793     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
00794   }
00795 
00796   if (vYCoeffsBank) free (vYCoeffsBank);
00797   if (vCCoeffsBank) free (vCCoeffsBank);
00798 
00799 }
yuv2rgb_altivec.c