#include "stdafx.h"
#include "libmpeg2.h"

#define BITS_INV_ACC 4 // 4 or 5 for IEEE
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short RND_INV_ROW = 1024 * (6 - BITS_INV_ACC); // rounding constant for the row pass, 1 << (SHIFT_INV_ROW - 1)
const short RND_INV_COL = 16 * (BITS_INV_ACC - 3);   // rounding constant for the column pass, 1 << (SHIFT_INV_COL - 1)
const short RND_INV_CORR = RND_INV_COL - 1;          // correction constant used with the column rounding

__declspec(align(16)) short M128_one_corr[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__declspec(align(16)) short M128_round_inv_row[8] = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};
__declspec(align(16)) short M128_round_inv_col[8] = {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
__declspec(align(16)) short M128_round_inv_corr[8] = {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
__declspec(align(16)) short M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};         // tan(1*pi/16) * 2^16
__declspec(align(16)) short M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};         // tan(2*pi/16) * 2^16
__declspec(align(16)) short M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // (tan(3*pi/16) - 1) * 2^16
__declspec(align(16)) short M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // (cos(4*pi/16) - 1) * 2^16
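
// Coefficient tables for the row pass of the IDCT. Rows are transformed in
// pairs and share tables as follows: rows 0 and 4 use M128_tab_i_04, rows 1
// and 7 use M128_tab_i_17, rows 2 and 6 use M128_tab_i_26, and rows 3 and 5
// use M128_tab_i_35 (see idct_M128ASM below).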
__declspec(align(16)) short M128_tab_i_04[] =
{
    16384, 21407, 16384, 8867,
    16384, -8867, 16384, -21407,
    16384, 8867, -16384, -21407,
    -16384, 21407, 16384, -8867,
    22725, 19266, 19266, -4520,
    12873, -22725, 4520, -12873,
    12873, 4520, -22725, -12873,
    4520, 19266, 19266, -22725
};

__declspec(align(16)) short M128_tab_i_17[] =
{
    22725, 29692, 22725, 12299,
    22725, -12299, 22725, -29692,
    22725, 12299, -22725, -29692,
    -22725, 29692, 22725, -12299,
    31521, 26722, 26722, -6270,
    17855, -31521, 6270, -17855,
    17855, 6270, -31521, -17855,
    6270, 26722, 26722, -31521
};

__declspec(align(16)) short M128_tab_i_26[] =
{
    21407, 27969, 21407, 11585,
    21407, -11585, 21407, -27969,
    21407, 11585, -21407, -27969,
    -21407, 27969, 21407, -11585,
    29692, 25172, 25172, -5906,
    16819, -29692, 5906, -16819,
    16819, 5906, -29692, -16819,
    5906, 25172, 25172, -29692
};

__declspec(align(16)) short M128_tab_i_35[] =
{
    19266, 25172, 19266, 10426,
    19266, -10426, 19266, -25172,
    19266, 10426, -19266, -25172,
    -19266, 25172, 19266, -10426,
    26722, 22654, 22654, -5315,
    15137, -26722, 5315, -15137,
    15137, 5315, -26722, -15137,
    5315, 22654, 22654, -26722
};
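
// DCT_8_INV_ROW: one row pass over two rows at a time. On entry xmm0 and xmm4
// each hold one 8-coefficient row; esi and ecx point to the coefficient table
// for the respective row. The rows are multiplied by the table with pmaddwd,
// rounded with M128_round_inv_row, shifted right by SHIFT_INV_ROW (12) and
// packed back to 16-bit results in xmm0 and xmm4.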
#define DCT_8_INV_ROW __asm{ \
    __asm pshuflw xmm0, xmm0, 0xD8 \
    __asm pshufd xmm1, xmm0, 0 \
    __asm pmaddwd xmm1, [esi] \
    __asm pshufd xmm3, xmm0, 0x55 \
    __asm pshufhw xmm0, xmm0, 0xD8 \
    __asm pmaddwd xmm3, [esi+32] \
    __asm pshufd xmm2, xmm0, 0xAA \
    __asm pshufd xmm0, xmm0, 0xFF \
    __asm pmaddwd xmm2, [esi+16] \
    __asm pshufhw xmm4, xmm4, 0xD8 \
    __asm paddd xmm1, M128_round_inv_row \
    __asm pshuflw xmm4, xmm4, 0xD8 \
    __asm pmaddwd xmm0, [esi+48] \
    __asm pshufd xmm5, xmm4, 0 \
    __asm pshufd xmm6, xmm4, 0xAA \
    __asm pmaddwd xmm5, [ecx] \
    __asm paddd xmm1, xmm2 \
    __asm movdqa xmm2, xmm1 \
    __asm pshufd xmm7, xmm4, 0x55 \
    __asm pmaddwd xmm6, [ecx+16] \
    __asm paddd xmm0, xmm3 \
    __asm pshufd xmm4, xmm4, 0xFF \
    __asm psubd xmm2, xmm0 \
    __asm pmaddwd xmm7, [ecx+32] \
    __asm paddd xmm0, xmm1 \
    __asm psrad xmm2, 12 \
    __asm paddd xmm5, M128_round_inv_row \
    __asm pmaddwd xmm4, [ecx+48] \
    __asm paddd xmm5, xmm6 \
    __asm movdqa xmm6, xmm5 \
    __asm psrad xmm0, 12 \
    __asm pshufd xmm2, xmm2, 0x1B \
    __asm packssdw xmm0, xmm2 \
    __asm paddd xmm4, xmm7 \
    __asm psubd xmm6, xmm4 \
    __asm paddd xmm4, xmm5 \
    __asm psrad xmm6, 12 \
    __asm psrad xmm4, 12 \
    __asm pshufd xmm6, xmm6, 0x1B \
    __asm packssdw xmm4, xmm6 \
}
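
// DCT_8_INV_COL_8: column pass over all eight columns at once. edx points to
// the row-transformed block; rows 5 and 7 are taken directly from xmm0/xmm4
// (the caller does not write them back), the remaining rows are read from
// memory. Results are rounded (M128_round_inv_col / M128_round_inv_corr),
// shifted right by SHIFT_INV_COL and stored back to [edx].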
#define DCT_8_INV_COL_8 __asm{ \
    __asm movdqa xmm1, XMMWORD PTR M128_tg_3_16 \
    __asm movdqa xmm2, xmm0 \
    __asm movdqa xmm3, XMMWORD PTR [edx+3*16] \
    __asm pmulhw xmm0, xmm1 \
    __asm pmulhw xmm1, xmm3 \
    __asm movdqa xmm5, XMMWORD PTR M128_tg_1_16 \
    __asm movdqa xmm6, xmm4 \
    __asm pmulhw xmm4, xmm5 \
    __asm paddsw xmm0, xmm2 \
    __asm pmulhw xmm5, [edx+1*16] \
    __asm paddsw xmm1, xmm3 \
    __asm movdqa xmm7, XMMWORD PTR [edx+6*16] \
    __asm paddsw xmm0, xmm3 \
    __asm movdqa xmm3, XMMWORD PTR M128_tg_2_16 \
    __asm psubsw xmm2, xmm1 \
    __asm pmulhw xmm7, xmm3 \
    __asm movdqa xmm1, xmm0 \
    __asm pmulhw xmm3, [edx+2*16] \
    __asm psubsw xmm5, xmm6 \
    __asm paddsw xmm4, [edx+1*16] \
    __asm paddsw xmm0, xmm4 \
    __asm paddsw xmm0, XMMWORD PTR M128_one_corr \
    __asm psubsw xmm4, xmm1 \
    __asm movdqa xmm6, xmm5 \
    __asm psubsw xmm5, xmm2 \
    __asm paddsw xmm5, XMMWORD PTR M128_one_corr \
    __asm paddsw xmm6, xmm2 \
    __asm movdqa [edx+7*16], xmm0 \
    __asm movdqa xmm1, xmm4 \
    __asm movdqa xmm0, XMMWORD PTR M128_cos_4_16 \
    __asm paddsw xmm4, xmm5 \
    __asm movdqa xmm2, XMMWORD PTR M128_cos_4_16 \
    __asm pmulhw xmm2, xmm4 \
    __asm movdqa [edx+3*16], xmm6 \
    __asm psubsw xmm1, xmm5 \
    __asm paddsw xmm7, [edx+2*16] \
    __asm psubsw xmm3, [edx+6*16] \
    __asm movdqa xmm6, [edx] \
    __asm pmulhw xmm0, xmm1 \
    __asm movdqa xmm5, [edx+4*16] \
    __asm paddsw xmm5, xmm6 \
    __asm psubsw xmm6, [edx+4*16] \
    __asm paddsw xmm4, xmm2 \
    __asm por xmm4, XMMWORD PTR M128_one_corr \
    __asm paddsw xmm0, xmm1 \
    __asm por xmm0, XMMWORD PTR M128_one_corr \
    __asm movdqa xmm2, xmm5 \
    __asm paddsw xmm5, xmm7 \
    __asm movdqa xmm1, xmm6 \
    __asm paddsw xmm5, XMMWORD PTR M128_round_inv_col \
    __asm psubsw xmm2, xmm7 \
    __asm movdqa xmm7, [edx+7*16] \
    __asm paddsw xmm6, xmm3 \
    __asm paddsw xmm6, XMMWORD PTR M128_round_inv_col \
    __asm paddsw xmm7, xmm5 \
    __asm psraw xmm7, SHIFT_INV_COL \
    __asm psubsw xmm1, xmm3 \
    __asm paddsw xmm1, XMMWORD PTR M128_round_inv_corr \
    __asm movdqa xmm3, xmm6 \
    __asm paddsw xmm2, XMMWORD PTR M128_round_inv_corr \
    __asm paddsw xmm6, xmm4 \
    __asm movdqa [edx], xmm7 \
    __asm psraw xmm6, SHIFT_INV_COL \
    __asm movdqa xmm7, xmm1 \
    __asm paddsw xmm1, xmm0 \
    __asm movdqa [edx+1*16], xmm6 \
    __asm psraw xmm1, SHIFT_INV_COL \
    __asm movdqa xmm6, [edx+3*16] \
    __asm psubsw xmm7, xmm0 \
    __asm psraw xmm7, SHIFT_INV_COL \
    __asm movdqa [edx+2*16], xmm1 \
    __asm psubsw xmm5, [edx+7*16] \
    __asm psraw xmm5, SHIFT_INV_COL \
    __asm movdqa [edx+7*16], xmm5 \
    __asm psubsw xmm3, xmm4 \
    __asm paddsw xmm6, xmm2 \
    __asm psubsw xmm2, [edx+3*16] \
    __asm psraw xmm6, SHIFT_INV_COL \
    __asm psraw xmm2, SHIFT_INV_COL \
    __asm movdqa [edx+3*16], xmm6 \
    __asm psraw xmm3, SHIFT_INV_COL \
    __asm movdqa [edx+4*16], xmm2 \
    __asm movdqa [edx+5*16], xmm7 \
    __asm movdqa [edx+6*16], xmm3 \
}
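
// Full in-place 8x8 inverse DCT of a block of 16-bit coefficients. The block
// pointer must be 16-byte aligned; four row-pass invocations are followed by
// a single column pass.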
static void idct_M128ASM(short* src)
{
    ASSERT(((DWORD)src & 0xf) == 0); // movdqa requires 16-byte alignment

    __asm mov edx, src

    // rows 0 and 2 (tables tab_i_04 / tab_i_26)
    __asm movdqa xmm0, XMMWORD PTR[edx]
    __asm lea esi, M128_tab_i_04
    __asm movdqa xmm4, XMMWORD PTR[edx+16*2]
    __asm lea ecx, M128_tab_i_26
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx], xmm0
    __asm movdqa XMMWORD PTR[edx+16*2], xmm4

    // rows 4 and 6 reuse the same tables
    __asm movdqa xmm0, XMMWORD PTR[edx+16*4]
    __asm movdqa xmm4, XMMWORD PTR[edx+16*6]
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx+16*4], xmm0
    __asm movdqa XMMWORD PTR[edx+16*6], xmm4

    // rows 3 and 1 (tables tab_i_35 / tab_i_17)
    __asm movdqa xmm0, XMMWORD PTR[edx+16*3]
    __asm lea esi, M128_tab_i_35
    __asm movdqa xmm4, XMMWORD PTR[edx+16*1]
    __asm lea ecx, M128_tab_i_17
    DCT_8_INV_ROW;
    __asm movdqa XMMWORD PTR[edx+16*3], xmm0
    __asm movdqa XMMWORD PTR[edx+16*1], xmm4

    // rows 5 and 7 reuse the same tables; their results stay in xmm0/xmm4
    __asm movdqa xmm0, XMMWORD PTR[edx+16*5]
    __asm movdqa xmm4, XMMWORD PTR[edx+16*7]
    DCT_8_INV_ROW;

    // column pass consumes xmm0/xmm4 and the rows stored at [edx]
    DCT_8_INV_COL_8

}

#define CLIP(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
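
// IDCT the coefficient block, clip the result to 0..255 (packuswb saturates),
// copy it to dest with the given stride and clear the block afterwards.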
void mpeg2_idct_copy_sse2(int16_t* block, uint8_t* dest, const int stride)
{
    idct_M128ASM(block);

    __asm
    {
        mov esi, block
        mov edi, dest
        mov edx, stride
        lea ecx, [edx+edx] // ecx = 2*stride, advance two rows at a time

        // load all eight rows of 16-bit results
        movdqa xmm0, [esi+16*0]
        movdqa xmm1, [esi+16*1]
        movdqa xmm2, [esi+16*2]
        movdqa xmm3, [esi+16*3]
        movdqa xmm4, [esi+16*4]
        movdqa xmm5, [esi+16*5]
        movdqa xmm6, [esi+16*6]
        movdqa xmm7, [esi+16*7]

        // saturate to unsigned 8-bit, two rows per register
        packuswb xmm0, xmm1
        packuswb xmm2, xmm3
        packuswb xmm4, xmm5
        packuswb xmm6, xmm7

        // store 8 pixels per destination row
        movlps [edi], xmm0
        movhps [edi+edx], xmm0
        add edi, ecx
        movlps [edi], xmm2
        movhps [edi+edx], xmm2
        add edi, ecx
        movlps [edi], xmm4
        movhps [edi+edx], xmm4
        add edi, ecx
        movlps [edi], xmm6
        movhps [edi+edx], xmm6

        // clear the coefficient block for the next use
        xorps xmm7, xmm7
        movdqa [esi+16*0], xmm7
        movdqa [esi+16*1], xmm7
        movdqa [esi+16*2], xmm7
        movdqa [esi+16*3], xmm7
        movdqa [esi+16*4], xmm7
        movdqa [esi+16*5], xmm7
        movdqa [esi+16*6], xmm7
        movdqa [esi+16*7], xmm7
    }
}
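
// IDCT the coefficient block and add it to the prediction already in dest,
// saturating to 0..255; the block is cleared afterwards. The 'last' argument
// is not used by this SSE2 implementation.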
void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride)
{
    idct_M128ASM(block);

    __asm
    {
        mov esi, block
        mov edi, dest
        mov ecx, 4 // 4 iterations, two rows per iteration
        mov edx, stride
        xorps xmm7, xmm7 // constant zero, also used to clear the block

    mpeg2_idct_add_sse2_loop:

        // two rows of IDCT output
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]

        // add the first row to the prediction with signed saturation
        movlps xmm2, [edi]
        punpcklbw xmm2, xmm7
        paddsw xmm0, xmm2

        // same for the second row
        movlps xmm2, [edi+edx]
        punpcklbw xmm2, xmm7
        paddsw xmm1, xmm2

        // saturate both rows to unsigned 8-bit
        packuswb xmm0, xmm1

        // clear the two source rows for the next block
        movdqa [esi], xmm7
        movdqa [esi+16], xmm7

        // store the reconstructed pixels
        movlps [edi], xmm0
        movhps [edi+edx], xmm0

        lea esi, [esi+16*2]
        lea edi, [edi+edx*2]

        dec ecx
        jnz mpeg2_idct_add_sse2_loop
    }
}

void mpeg2_idct_init_sse2()
{
    // nothing to initialize: all tables are compile-time constants
}