00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "stdafx.h"
00025 #include "libmpeg2.h"
00026
00027 __declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
00028
00029 static void MC_put_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00030 {
00031 __asm
00032 {
00033 mov edx, ref
00034 mov ecx, dest
00035 mov esi, height
00036 mov eax, stride
00037 lea edi, [eax+eax]
00038
00039 MC_put_o_16_sse2_loop:
00040
00041 movdqu xmm0, [edx]
00042 movdqu xmm1, [edx+eax]
00043 movdqa [ecx], xmm0
00044 movdqa [ecx+eax], xmm1
00045 add edx, edi
00046 add ecx, edi
00047 sub esi, 2
00048
00049 jg MC_put_o_16_sse2_loop
00050 }
00051 }
00052
00053 static void MC_put_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00054 {
00055 __asm
00056 {
00057 mov edx, ref
00058 mov ecx, dest
00059 mov esi, height
00060 mov eax, stride
00061 lea edi, [eax+eax]
00062
00063 MC_put_o_8_sse2_loop:
00064
00065 movlpd xmm0, [edx]
00066 movhpd xmm0, [edx+eax]
00067 movlpd [ecx], xmm0
00068 movhpd [ecx+eax], xmm0
00069 add edx, edi
00070 add ecx, edi
00071 sub esi, 2
00072
00073 jg MC_put_o_8_sse2_loop
00074 }
00075 }
00076
00077 static void MC_put_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00078 {
00079 __asm
00080 {
00081 mov edx, ref
00082 mov ecx, dest
00083 mov eax, stride
00084 mov esi, height
00085 lea edi, [eax+eax]
00086
00087 MC_put_x_16_sse2_loop:
00088
00089 movdqu xmm0, [edx]
00090 movdqu xmm1, [edx+1]
00091 movdqu xmm2, [edx+eax]
00092 movdqu xmm3, [edx+eax+1]
00093 pavgb xmm0, xmm1
00094 pavgb xmm2, xmm3
00095 movdqa [ecx], xmm0
00096 movdqa [ecx+eax], xmm2
00097 add edx, edi
00098 add ecx, edi
00099 sub esi, 2
00100
00101 jg MC_put_x_16_sse2_loop
00102 }
00103 }
00104
00105 static void MC_put_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00106 {
00107 __asm
00108 {
00109 mov edx, ref
00110 mov ecx, dest
00111 mov eax, stride
00112 mov esi, height
00113 lea edi, [eax+eax]
00114
00115 MC_put_x_8_sse2_loop:
00116
00117 movlpd xmm0, [edx]
00118 movlpd xmm1, [edx+1]
00119 movhpd xmm0, [edx+eax]
00120 movhpd xmm1, [edx+eax+1]
00121 pavgb xmm0, xmm1
00122 movlpd [ecx], xmm0
00123 movhpd [ecx+eax], xmm0
00124 add edx, edi
00125 add ecx, edi
00126 sub esi, 2
00127
00128 jg MC_put_x_8_sse2_loop
00129 }
00130 }
00131
00132 static void MC_put_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00133 {
00134 __asm
00135 {
00136 mov edx, ref
00137 mov ecx, dest
00138 mov eax, stride
00139 mov esi, height
00140 lea edi, [eax+eax]
00141
00142 movdqu xmm0, [edx]
00143
00144 MC_put_y_16_sse2_loop:
00145
00146 movdqu xmm1, [edx+eax]
00147 movdqu xmm2, [edx+edi]
00148 pavgb xmm0, xmm1
00149 pavgb xmm1, xmm2
00150 movdqa [ecx], xmm0
00151 movdqa [ecx+eax], xmm1
00152 movdqa xmm0, xmm2
00153 add edx, edi
00154 add ecx, edi
00155 sub esi, 2
00156
00157 jg MC_put_y_16_sse2_loop
00158 }
00159 }
00160
00161 static void MC_put_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00162 {
00163 __asm
00164 {
00165 mov edx, ref
00166 mov ecx, dest
00167 mov eax, stride
00168 mov esi, height
00169 lea edi, [eax+eax]
00170
00171 movhpd xmm0, [edx]
00172 movlpd xmm0, [edx+eax]
00173
00174 MC_put_y_8_sse2_loop:
00175
00176 movhpd xmm1, [edx+eax]
00177 movlpd xmm1, [edx+edi]
00178 pavgb xmm0, xmm1
00179 movhpd [ecx], xmm0
00180 movlpd [ecx+eax], xmm0
00181 movdqa xmm0, xmm1
00182 add edx, edi
00183 add ecx, edi
00184 sub esi, 2
00185
00186 jg MC_put_y_8_sse2_loop
00187 }
00188 }
00189
00190 static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00191 {
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228 __asm
00229 {
00230 mov edx, ref
00231 mov ecx, dest
00232 mov eax, stride
00233 mov esi, height
00234 lea edi, [eax+eax]
00235
00236 movdqa xmm7, [const_1_16_bytes]
00237 movdqu xmm0, [edx]
00238 movdqu xmm1, [edx+1]
00239
00240 MC_put_xy_16_sse2_loop:
00241
00242 movdqu xmm2, [edx+eax]
00243 movdqu xmm3, [edx+eax+1]
00244 movdqu xmm4, [edx+edi]
00245 movdqu xmm5, [edx+edi+1]
00246 pavgb xmm0, xmm1
00247 pavgb xmm2, xmm3
00248 movdqa xmm1, xmm5
00249 pavgb xmm5, xmm4
00250 psubusb xmm2, xmm7
00251 pavgb xmm0, xmm2
00252 pavgb xmm2, xmm5
00253 movdqa [ecx], xmm0
00254 movdqa xmm0, xmm4
00255 movdqa [ecx+eax], xmm2
00256 add edx, edi
00257 add ecx, edi
00258 sub esi, 2
00259
00260 jg MC_put_xy_16_sse2_loop
00261 }
00262 }
00263
00264 static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00265 {
00266 __asm
00267 {
00268 mov edx, ref
00269 mov ecx, dest
00270 mov eax, stride
00271 mov esi, height
00272 lea edi, [eax+eax]
00273
00274 movdqa xmm7, [const_1_16_bytes]
00275 movhpd xmm0, [edx]
00276 movlpd xmm0, [edx+eax]
00277 movhpd xmm2, [edx+1]
00278 movlpd xmm2, [edx+eax+1]
00279
00280 MC_put_xy_8_sse2_loop:
00281
00282 movhpd xmm1, [edx+eax]
00283 movlpd xmm1, [edx+edi]
00284 movhpd xmm3, [edx+eax+1]
00285 movlpd xmm3, [edx+edi+1]
00286 pavgb xmm0, xmm1
00287 pavgb xmm2, xmm3
00288 psubusb xmm0, xmm7
00289 pavgb xmm0, xmm1
00290 movhpd [ecx], xmm0
00291 movlpd [ecx+eax], xmm0
00292 movdqa xmm0, xmm1
00293 movdqa xmm2, xmm3
00294 add edx, edi
00295 add ecx, edi
00296 sub esi, 2
00297
00298 jg MC_put_xy_8_sse2_loop
00299 }
00300 }
00301
00302 static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00303 {
00304 __asm
00305 {
00306 mov edx, ref
00307 mov ecx, dest
00308 mov esi, height
00309 mov eax, stride
00310 lea edi, [eax+eax]
00311
00312 MC_avg_o_16_sse2_loop:
00313
00314 movdqu xmm0, [edx]
00315 movdqu xmm1, [edx+eax]
00316 movdqa xmm2, [ecx]
00317 movdqa xmm3, [ecx+eax]
00318 pavgb xmm0, xmm2
00319 pavgb xmm1, xmm3
00320 movdqa [ecx], xmm0
00321 movdqa [ecx+eax], xmm1
00322 add edx, edi
00323 add ecx, edi
00324 sub esi, 2
00325
00326 jg MC_avg_o_16_sse2_loop
00327 }
00328 }
00329
00330 static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00331 {
00332 __asm
00333 {
00334 mov edx, ref
00335 mov ecx, dest
00336 mov esi, height
00337 mov eax, stride
00338 lea edi, [eax+eax]
00339
00340 MC_avg_o_16_sse2_loop:
00341
00342 movlpd xmm0, [edx]
00343 movhpd xmm0, [edx+eax]
00344 movlpd xmm1, [ecx]
00345 movhpd xmm1, [ecx+eax]
00346 pavgb xmm0, xmm1
00347 movlpd [ecx], xmm0
00348 movhpd [ecx+eax], xmm0
00349 add edx, edi
00350 add ecx, edi
00351 sub esi, 2
00352
00353 jg MC_avg_o_16_sse2_loop
00354 }
00355 }
00356
00357 static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00358 {
00359 __asm
00360 {
00361 mov edx, ref
00362 mov ecx, dest
00363 mov eax, stride
00364 mov esi, height
00365 lea edi, [eax+eax]
00366
00367 MC_avg_x_16_sse2_loop:
00368
00369 movdqu xmm0, [edx]
00370 movdqu xmm1, [edx+1]
00371 movdqu xmm2, [edx+eax]
00372 movdqu xmm3, [edx+eax+1]
00373 pavgb xmm0, xmm1
00374 pavgb xmm2, xmm3
00375 movdqa xmm4, [ecx]
00376 movdqa xmm5, [ecx+eax]
00377 pavgb xmm0, xmm4
00378 pavgb xmm2, xmm5
00379 movdqa [ecx], xmm0
00380 movdqa [ecx+eax], xmm2
00381 add edx, edi
00382 add ecx, edi
00383 sub esi, 2
00384
00385 jg MC_avg_x_16_sse2_loop
00386 }
00387 }
00388
00389 static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00390 {
00391 __asm
00392 {
00393 mov edx, ref
00394 mov ecx, dest
00395 mov eax, stride
00396 mov esi, height
00397 lea edi, [eax+eax]
00398
00399 MC_avg_x_8_sse2_loop:
00400
00401 movlpd xmm0, [edx]
00402 movlpd xmm1, [edx+1]
00403 movhpd xmm0, [edx+eax]
00404 movhpd xmm1, [edx+eax+1]
00405 pavgb xmm0, xmm1
00406 movlpd xmm2, [ecx]
00407 movhpd xmm2, [ecx+eax]
00408 pavgb xmm0, xmm2
00409 movlpd [ecx], xmm0
00410 movhpd [ecx+eax], xmm0
00411 add edx, edi
00412 add ecx, edi
00413 sub esi, 2
00414
00415 jg MC_avg_x_8_sse2_loop
00416 }
00417 }
00418
00419 static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00420 {
00421 __asm
00422 {
00423 mov edx, ref
00424 mov ecx, dest
00425 mov eax, stride
00426 mov esi, height
00427 lea edi, [eax+eax]
00428
00429 movdqu xmm0, [edx]
00430
00431 MC_avg_y_16_sse2_loop:
00432
00433 movdqu xmm1, [edx+eax]
00434 movdqu xmm2, [edx+edi]
00435 pavgb xmm0, xmm1
00436 pavgb xmm1, xmm2
00437 movdqa xmm3, [ecx]
00438 movdqa xmm4, [ecx+eax]
00439 pavgb xmm0, xmm3
00440 pavgb xmm1, xmm4
00441 movdqa [ecx], xmm0
00442 movdqa xmm0, xmm2
00443 movdqa [ecx+eax], xmm1
00444 add edx, edi
00445 add ecx, edi
00446 sub esi, 2
00447
00448 jg MC_avg_y_16_sse2_loop
00449 }
00450 }
00451
00452 static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00453 {
00454 __asm
00455 {
00456 mov edx, ref
00457 mov ecx, dest
00458 mov eax, stride
00459 mov esi, height
00460 lea edi, [eax+eax]
00461
00462 movhpd xmm0, [edx]
00463 movlpd xmm0, [edx+eax]
00464
00465 MC_put_y_8_sse2_loop:
00466
00467 movhpd xmm1, [edx+eax]
00468 movlpd xmm1, [edx+edi]
00469 pavgb xmm0, xmm1
00470 movhpd xmm2, [ecx]
00471 movlpd xmm2, [ecx+eax]
00472 pavgb xmm0, xmm2
00473 movhpd [ecx], xmm0
00474 movlpd [ecx+eax], xmm0
00475 movdqa xmm0, xmm1
00476 add edx, edi
00477 add ecx, edi
00478 sub esi, 2
00479
00480 jg MC_put_y_8_sse2_loop
00481 }
00482 }
00483
00484 static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00485 {
00486 __asm
00487 {
00488 mov edx, ref
00489 mov ecx, dest
00490 mov eax, stride
00491 mov esi, height
00492 lea edi, [eax+eax]
00493
00494 movdqa xmm7, [const_1_16_bytes]
00495 movdqu xmm0, [edx]
00496 movdqu xmm1, [edx+1]
00497
00498 MC_avg_xy_16_sse2_loop:
00499
00500 movdqu xmm2, [edx+eax]
00501 movdqu xmm3, [edx+eax+1]
00502 movdqu xmm4, [edx+edi]
00503 movdqu xmm5, [edx+edi+1]
00504 pavgb xmm0, xmm1
00505 pavgb xmm2, xmm3
00506 movdqa xmm1, xmm5
00507 pavgb xmm5, xmm4
00508 psubusb xmm2, xmm7
00509 pavgb xmm0, xmm2
00510 pavgb xmm2, xmm5
00511 movdqa xmm5, [ecx]
00512 movdqa xmm6, [ecx+eax]
00513 pavgb xmm0, xmm5
00514 pavgb xmm2, xmm6
00515 movdqa [ecx], xmm0
00516 movdqa xmm0, xmm4
00517 movdqa [ecx+eax], xmm2
00518 add edx, edi
00519 add ecx, edi
00520 sub esi, 2
00521
00522 jg MC_avg_xy_16_sse2_loop
00523 }
00524 }
00525
00526 static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00527 {
00528 __asm
00529 {
00530 mov edx, ref
00531 mov ecx, dest
00532 mov eax, stride
00533 mov esi, height
00534 lea edi, [eax+eax]
00535
00536 movdqa xmm7, [const_1_16_bytes]
00537 movhpd xmm0, [edx]
00538 movlpd xmm0, [edx+eax]
00539 movhpd xmm2, [edx+1]
00540 movlpd xmm2, [edx+eax+1]
00541
00542 MC_avg_xy_8_sse2_loop:
00543
00544 movhpd xmm1, [edx+eax]
00545 movlpd xmm1, [edx+edi]
00546 movhpd xmm3, [edx+eax+1]
00547 movlpd xmm3, [edx+edi+1]
00548 pavgb xmm0, xmm1
00549 pavgb xmm2, xmm3
00550 psubusb xmm0, xmm7
00551 pavgb xmm0, xmm2
00552 movhpd xmm4, [ecx]
00553 movlpd xmm4, [ecx+eax]
00554 pavgb xmm0, xmm4
00555 movhpd [ecx], xmm0
00556 movlpd [ecx+eax], xmm0
00557 movdqa xmm0, xmm1
00558 movdqa xmm2, xmm3
00559 add edx, edi
00560 add ecx, edi
00561 sub esi, 2
00562
00563 jg MC_avg_xy_8_sse2_loop
00564 }
00565 }
00566
00567 mpeg2_mc_t mpeg2_mc_sse2 =
00568 {
00569 {MC_put_o_16_sse2, MC_put_x_16_sse2, MC_put_y_16_sse2, MC_put_xy_16_sse2,
00570 MC_put_o_8_sse2, MC_put_x_8_sse2, MC_put_y_8_sse2, MC_put_xy_8_sse2},
00571 {MC_avg_o_16_sse2, MC_avg_x_16_sse2, MC_avg_y_16_sse2, MC_avg_xy_16_sse2,
00572 MC_avg_o_8_sse2, MC_avg_x_8_sse2, MC_avg_y_8_sse2, MC_avg_xy_8_sse2}
00573 };