#include "stdafx.h"
#include "libmpeg2.h"

#define BITS_INV_ACC 4 // 4 or 5 for IEEE
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short RND_INV_ROW = 1024 * (6 - BITS_INV_ACC); // rounding constant for the row pass, 1 << (SHIFT_INV_ROW - 1)
const short RND_INV_COL = 16 * (BITS_INV_ACC - 3);   // rounding constant for the column pass, 1 << (SHIFT_INV_COL - 1)
const short RND_INV_CORR = RND_INV_COL - 1;          // correction constant used with the column rounding

__declspec(align(16)) short M128_one_corr[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__declspec(align(16)) short M128_round_inv_row[8] = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};
__declspec(align(16)) short M128_round_inv_col[8] = {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
__declspec(align(16)) short M128_round_inv_corr[8] = {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
__declspec(align(16)) short M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};         // tan(1*pi/16) * 2^16
__declspec(align(16)) short M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};         // tan(2*pi/16) * 2^16
__declspec(align(16)) short M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // (tan(3*pi/16) - 1) * 2^16
__declspec(align(16)) short M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // (cos(4*pi/16) - 1) * 2^16
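
// Coefficient tables for the row pass of the IDCT. Rows are transformed in
// pairs and share tables as follows: rows 0 and 4 use M128_tab_i_04, rows 1
// and 7 use M128_tab_i_17, rows 2 and 6 use M128_tab_i_26, and rows 3 and 5
// use M128_tab_i_35 (see idct_M128ASM below).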
__declspec(align(16)) short M128_tab_i_04[] =
{
    16384, 21407, 16384, 8867,
    16384, -8867, 16384, -21407,
    16384, 8867, -16384, -21407,
    -16384, 21407, 16384, -8867,
    22725, 19266, 19266, -4520,
    12873, -22725, 4520, -12873,
    12873, 4520, -22725, -12873,
    4520, 19266, 19266, -22725
};

__declspec(align(16)) short M128_tab_i_17[] =
{
    22725, 29692, 22725, 12299,
    22725, -12299, 22725, -29692,
    22725, 12299, -22725, -29692,
    -22725, 29692, 22725, -12299,
    31521, 26722, 26722, -6270,
    17855, -31521, 6270, -17855,
    17855, 6270, -31521, -17855,
    6270, 26722, 26722, -31521
};

__declspec(align(16)) short M128_tab_i_26[] =
{
    21407, 27969, 21407, 11585,
    21407, -11585, 21407, -27969,
    21407, 11585, -21407, -27969,
    -21407, 27969, 21407, -11585,
    29692, 25172, 25172, -5906,
    16819, -29692, 5906, -16819,
    16819, 5906, -29692, -16819,
    5906, 25172, 25172, -29692
};

__declspec(align(16)) short M128_tab_i_35[] =
{
    19266, 25172, 19266, 10426,
    19266, -10426, 19266, -25172,
    19266, 10426, -19266, -25172,
    -19266, 25172, 19266, -10426,
    26722, 22654, 22654, -5315,
    15137, -26722, 5315, -15137,
    15137, 5315, -26722, -15137,
    5315, 22654, 22654, -26722
};
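
// DCT_8_INV_ROW: one row pass over two rows at a time. On entry xmm0 and xmm4
// each hold one 8-coefficient row; esi and ecx point to the coefficient table
// for the respective row. The rows are multiplied by the table with pmaddwd,
// rounded with M128_round_inv_row, shifted right by SHIFT_INV_ROW (12) and
// packed back to 16-bit results in xmm0 and xmm4.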
#define DCT_8_INV_ROW __asm{ \
    __asm pshuflw xmm0, xmm0, 0xD8 \
    __asm pshufd xmm1, xmm0, 0 \
    __asm pmaddwd xmm1, [esi] \
    __asm pshufd xmm3, xmm0, 0x55 \
    __asm pshufhw xmm0, xmm0, 0xD8 \
    __asm pmaddwd xmm3, [esi+32] \
    __asm pshufd xmm2, xmm0, 0xAA \
    __asm pshufd xmm0, xmm0, 0xFF \
    __asm pmaddwd xmm2, [esi+16] \
    __asm pshufhw xmm4, xmm4, 0xD8 \
    __asm paddd xmm1, M128_round_inv_row \
    __asm pshuflw xmm4, xmm4, 0xD8 \
    __asm pmaddwd xmm0, [esi+48] \
    __asm pshufd xmm5, xmm4, 0 \
    __asm pshufd xmm6, xmm4, 0xAA \
    __asm pmaddwd xmm5, [ecx] \
    __asm paddd xmm1, xmm2 \
    __asm movdqa xmm2, xmm1 \
    __asm pshufd xmm7, xmm4, 0x55 \
    __asm pmaddwd xmm6, [ecx+16] \
    __asm paddd xmm0, xmm3 \
    __asm pshufd xmm4, xmm4, 0xFF \
    __asm psubd xmm2, xmm0 \
    __asm pmaddwd xmm7, [ecx+32] \
    __asm paddd xmm0, xmm1 \
    __asm psrad xmm2, 12 \
    __asm paddd xmm5, M128_round_inv_row \
    __asm pmaddwd xmm4, [ecx+48] \
    __asm paddd xmm5, xmm6 \
    __asm movdqa xmm6, xmm5 \
    __asm psrad xmm0, 12 \
    __asm pshufd xmm2, xmm2, 0x1B \
    __asm packssdw xmm0, xmm2 \
    __asm paddd xmm4, xmm7 \
    __asm psubd xmm6, xmm4 \
    __asm paddd xmm4, xmm5 \
    __asm psrad xmm6, 12 \
    __asm psrad xmm4, 12 \
    __asm pshufd xmm6, xmm6, 0x1B \
    __asm packssdw xmm4, xmm6 \
}
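
// DCT_8_INV_COL_8: column pass over all eight columns at once. edx points to
// the row-transformed block; rows 5 and 7 are taken directly from xmm0/xmm4
// (the caller does not write them back), the remaining rows are read from
// memory. Results are rounded (M128_round_inv_col / M128_round_inv_corr),
// shifted right by SHIFT_INV_COL and stored back to [edx].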
#define DCT_8_INV_COL_8 __asm{ \
    __asm movdqa xmm1, XMMWORD PTR M128_tg_3_16 \
    __asm movdqa xmm2, xmm0 \
    __asm movdqa xmm3, XMMWORD PTR [edx+3*16] \
    __asm pmulhw xmm0, xmm1 \
    __asm pmulhw xmm1, xmm3 \
    __asm movdqa xmm5, XMMWORD PTR M128_tg_1_16 \
    __asm movdqa xmm6, xmm4 \
    __asm pmulhw xmm4, xmm5 \
    __asm paddsw xmm0, xmm2 \
    __asm pmulhw xmm5, [edx+1*16] \
    __asm paddsw xmm1, xmm3 \
    __asm movdqa xmm7, XMMWORD PTR [edx+6*16] \
    __asm paddsw xmm0, xmm3 \
    __asm movdqa xmm3, XMMWORD PTR M128_tg_2_16 \
    __asm psubsw xmm2, xmm1 \
    __asm pmulhw xmm7, xmm3 \
    __asm movdqa xmm1, xmm0 \
    __asm pmulhw xmm3, [edx+2*16] \
    __asm psubsw xmm5, xmm6 \
    __asm paddsw xmm4, [edx+1*16] \
    __asm paddsw xmm0, xmm4 \
    __asm paddsw xmm0, XMMWORD PTR M128_one_corr \
    __asm psubsw xmm4, xmm1 \
    __asm movdqa xmm6, xmm5 \
    __asm psubsw xmm5, xmm2 \
    __asm paddsw xmm5, XMMWORD PTR M128_one_corr \
    __asm paddsw xmm6, xmm2 \
    __asm movdqa [edx+7*16], xmm0 \
    __asm movdqa xmm1, xmm4 \
    __asm movdqa xmm0, XMMWORD PTR M128_cos_4_16 \
    __asm paddsw xmm4, xmm5 \
    __asm movdqa xmm2, XMMWORD PTR M128_cos_4_16 \
    __asm pmulhw xmm2, xmm4 \
    __asm movdqa [edx+3*16], xmm6 \
    __asm psubsw xmm1, xmm5 \
    __asm paddsw xmm7, [edx+2*16] \
    __asm psubsw xmm3, [edx+6*16] \
    __asm movdqa xmm6, [edx] \
    __asm pmulhw xmm0, xmm1 \
    __asm movdqa xmm5, [edx+4*16] \
    __asm paddsw xmm5, xmm6 \
    __asm psubsw xmm6, [edx+4*16] \
    __asm paddsw xmm4, xmm2 \
    __asm por xmm4, XMMWORD PTR M128_one_corr \
    __asm paddsw xmm0, xmm1 \
    __asm por xmm0, XMMWORD PTR M128_one_corr \
    __asm movdqa xmm2, xmm5 \
    __asm paddsw xmm5, xmm7 \
    __asm movdqa xmm1, xmm6 \
    __asm paddsw xmm5, XMMWORD PTR M128_round_inv_col \
    __asm psubsw xmm2, xmm7 \
    __asm movdqa xmm7, [edx+7*16] \
    __asm paddsw xmm6, xmm3 \
    __asm paddsw xmm6, XMMWORD PTR M128_round_inv_col \
    __asm paddsw xmm7, xmm5 \
    __asm psraw xmm7, SHIFT_INV_COL \
    __asm psubsw xmm1, xmm3 \
    __asm paddsw xmm1, XMMWORD PTR M128_round_inv_corr \
    __asm movdqa xmm3, xmm6 \
    __asm paddsw xmm2, XMMWORD PTR M128_round_inv_corr \
    __asm paddsw xmm6, xmm4 \
    __asm movdqa [edx], xmm7 \
    __asm psraw xmm6, SHIFT_INV_COL \
    __asm movdqa xmm7, xmm1 \
    __asm paddsw xmm1, xmm0 \
    __asm movdqa [edx+1*16], xmm6 \
    __asm psraw xmm1, SHIFT_INV_COL \
    __asm movdqa xmm6, [edx+3*16] \
    __asm psubsw xmm7, xmm0 \
    __asm psraw xmm7, SHIFT_INV_COL \
    __asm movdqa [edx+2*16], xmm1 \
    __asm psubsw xmm5, [edx+7*16] \
    __asm psraw xmm5, SHIFT_INV_COL \
    __asm movdqa [edx+7*16], xmm5 \
    __asm psubsw xmm3, xmm4 \
    __asm paddsw xmm6, xmm2 \
    __asm psubsw xmm2, [edx+3*16] \
    __asm psraw xmm6, SHIFT_INV_COL \
    __asm psraw xmm2, SHIFT_INV_COL \
    __asm movdqa [edx+3*16], xmm6 \
    __asm psraw xmm3, SHIFT_INV_COL \
    __asm movdqa [edx+4*16], xmm2 \
    __asm movdqa [edx+5*16], xmm7 \
    __asm movdqa [edx+6*16], xmm3 \
}
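
// Full in-place 8x8 inverse DCT of a block of 16-bit coefficients. The block
// pointer must be 16-byte aligned; four row-pass invocations are followed by
// a single column pass.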
static void idct_M128ASM(short* src)
{
    ASSERT(((DWORD)src & 0xf) == 0); // movdqa requires 16-byte alignment

    __asm mov edx, src

    // rows 0 and 2 (tables tab_i_04 / tab_i_26)
    __asm movdqa xmm0, XMMWORD PTR[edx]
    __asm lea esi, M128_tab_i_04
    __asm movdqa xmm4, XMMWORD PTR[edx+16*2]
    __asm lea ecx, M128_tab_i_26
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx], xmm0
    __asm movdqa XMMWORD PTR[edx+16*2], xmm4

    // rows 4 and 6 reuse the same tables
    __asm movdqa xmm0, XMMWORD PTR[edx+16*4]
    __asm movdqa xmm4, XMMWORD PTR[edx+16*6]
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx+16*4], xmm0
    __asm movdqa XMMWORD PTR[edx+16*6], xmm4

    // rows 3 and 1 (tables tab_i_35 / tab_i_17)
    __asm movdqa xmm0, XMMWORD PTR[edx+16*3]
    __asm lea esi, M128_tab_i_35
    __asm movdqa xmm4, XMMWORD PTR[edx+16*1]
    __asm lea ecx, M128_tab_i_17
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx+16*3], xmm0
    __asm movdqa XMMWORD PTR[edx+16*1], xmm4

    // rows 5 and 7 reuse the same tables; their results stay in xmm0/xmm4
    __asm movdqa xmm0, XMMWORD PTR[edx+16*5]
    __asm movdqa xmm4, XMMWORD PTR[edx+16*7]
    DCT_8_INV_ROW;

    // column pass consumes xmm0/xmm4 and the rows stored at [edx]
    DCT_8_INV_COL_8

}

#define CLIP(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
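
// IDCT the coefficient block, clip the result to 0..255 (packuswb saturates),
// copy it to dest with the given stride and clear the block afterwards.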
void mpeg2_idct_copy_sse2(int16_t* block, uint8_t* dest, const int stride)
{
    idct_M128ASM(block);

    __asm
    {
        mov esi, block
        mov edi, dest
        mov edx, stride
        lea ecx, [edx+edx] // ecx = 2*stride, advance two rows at a time

        // load all eight rows of 16-bit results
        movdqa xmm0, [esi+16*0]
        movdqa xmm1, [esi+16*1]
        movdqa xmm2, [esi+16*2]
        movdqa xmm3, [esi+16*3]
        movdqa xmm4, [esi+16*4]
        movdqa xmm5, [esi+16*5]
        movdqa xmm6, [esi+16*6]
        movdqa xmm7, [esi+16*7]

        // saturate to unsigned 8-bit, two rows per register
        packuswb xmm0, xmm1
        packuswb xmm2, xmm3
        packuswb xmm4, xmm5
        packuswb xmm6, xmm7

        // store 8 pixels per destination row
        movlps [edi], xmm0
        movhps [edi+edx], xmm0
        add edi, ecx
        movlps [edi], xmm2
        movhps [edi+edx], xmm2
        add edi, ecx
        movlps [edi], xmm4
        movhps [edi+edx], xmm4
        add edi, ecx
        movlps [edi], xmm6
        movhps [edi+edx], xmm6

        // clear the coefficient block for the next use
        xorps xmm7, xmm7
        movdqa [esi+16*0], xmm7
        movdqa [esi+16*1], xmm7
        movdqa [esi+16*2], xmm7
        movdqa [esi+16*3], xmm7
        movdqa [esi+16*4], xmm7
        movdqa [esi+16*5], xmm7
        movdqa [esi+16*6], xmm7
        movdqa [esi+16*7], xmm7
    }
}
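
// IDCT the coefficient block and add it to the prediction already in dest,
// saturating to 0..255; the block is cleared afterwards. The 'last' argument
// is not used by this SSE2 implementation.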
void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride)
{
    idct_M128ASM(block);

    __asm
    {
        mov esi, block
        mov edi, dest
        mov ecx, 4 // 4 iterations, two rows per iteration
        mov edx, stride
        xorps xmm7, xmm7 // constant zero, also used to clear the block

    mpeg2_idct_add_sse2_loop:

        // two rows of IDCT output
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]

        // add the first row to the prediction with signed saturation
        movlps xmm2, [edi]
        punpcklbw xmm2, xmm7
        paddsw xmm0, xmm2

        // same for the second row
        movlps xmm2, [edi+edx]
        punpcklbw xmm2, xmm7
        paddsw xmm1, xmm2

        // saturate both rows to unsigned 8-bit
        packuswb xmm0, xmm1

        // clear the two source rows for the next block
        movdqa [esi], xmm7
        movdqa [esi+16], xmm7

        // store the reconstructed pixels
        movlps [edi], xmm0
        movhps [edi+edx], xmm0

        lea esi, [esi+16*2]
        lea edi, [edi+edx*2]

        dec ecx
        jnz mpeg2_idct_add_sse2_loop
    }
}

void mpeg2_idct_init_sse2()
{
    // nothing to initialize: all tables are compile-time constants
}