00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "stdafx.h"
00025 #include "vd.h"
00026
00027 #pragma warning(disable : 4799) // no emms... blahblahblah
00028
// ReadTSC(x): executes CPUID followed by RDTSC and stores the counter into x
// (EAX -> dword at x, EDX -> dword at x+4), so x must be at least 8 bytes.
// NOTE(review): the CPUID is presumably there to serialize execution before
// the timestamp is read - confirm. CPUID overwrites EAX, EBX, ECX and EDX.
00029 #define ReadTSC( x ) __asm cpuid \
00030 __asm rdtsc \
00031 __asm mov dword ptr x,eax \
00032 __asm mov dword ptr x+4,edx
00033
00034 CCpuID::CCpuID()
00035 {
00036 DWORD flags = 0;
00037
00038 __asm
00039 {
00040 mov eax, 1
00041 cpuid
00042 test edx, 0x00800000
00043 jz TEST_SSE
00044 or [flags], 1
00045 TEST_SSE:
00046 test edx, 0x02000000
00047 jz TEST_SSE2
00048 or [flags], 2
00049 or [flags], 4
00050 TEST_SSE2:
00051 test edx, 0x04000000
00052 jz TEST_3DNOW
00053 or [flags], 8
00054 TEST_3DNOW:
00055 mov eax, 0x80000001
00056 cpuid
00057 test edx, 0x80000000
00058 jz TEST_SSEMMX
00059 or [flags], 16
00060 TEST_SSEMMX:
00061 test edx, 0x00400000
00062 jz TEST_END
00063 or [flags], 2
00064 TEST_END:
00065 }
00066
00067 m_flags = (flag_t)flags;
00068 }
00069
// Global CPU feature record; its constructor (above) runs the CPUID probe.
// The routines below read g_cpuid.m_flags to pick a code path.
00070 CCpuID g_cpuid;
00071
00072 void memcpy_accel(void* dst, const void* src, size_t len)
00073 {
00074 if((g_cpuid.m_flags & CCpuID::sse2) && len >= 128
00075 && !((DWORD)src&15) && !((DWORD)dst&15))
00076 {
00077 __asm
00078 {
00079 mov esi, dword ptr [src]
00080 mov edi, dword ptr [dst]
00081 mov ecx, len
00082 shr ecx, 7
00083 memcpy_accel_sse2_loop:
00084 prefetchnta [esi+16*8]
00085 movdqa xmm0, [esi]
00086 movdqa xmm1, [esi+16*1]
00087 movdqa xmm2, [esi+16*2]
00088 movdqa xmm3, [esi+16*3]
00089 movdqa xmm4, [esi+16*4]
00090 movdqa xmm5, [esi+16*5]
00091 movdqa xmm6, [esi+16*6]
00092 movdqa xmm7, [esi+16*7]
00093 movntps [edi], xmm0
00094 movntps [edi+16*1], xmm1
00095 movntps [edi+16*2], xmm2
00096 movntps [edi+16*3], xmm3
00097 movntps [edi+16*4], xmm4
00098 movntps [edi+16*5], xmm5
00099 movntps [edi+16*6], xmm6
00100 movntps [edi+16*7], xmm7
00101 add esi, 128
00102 add edi, 128
00103 dec ecx
00104 jnz memcpy_accel_sse2_loop
00105 mov ecx, len
00106 and ecx, 127
00107 cmp ecx, 0
00108 je memcpy_accel_sse2_end
00109 memcpy_accel_sse2_loop2:
00110 mov dl, byte ptr[esi]
00111 mov byte ptr[edi], dl
00112 inc esi
00113 inc edi
00114 dec ecx
00115 jne memcpy_accel_sse2_loop2
00116 memcpy_accel_sse2_end:
00117 emms
00118 sfence
00119 }
00120 }
00121 else if((g_cpuid.m_flags & CCpuID::ssefpu) && len >= 128
00122 && !((DWORD)src&15) && !((DWORD)dst&15))
00123 {
00124 __asm
00125 {
00126 mov esi, dword ptr [src]
00127 mov edi, dword ptr [dst]
00128 mov ecx, len
00129 shr ecx, 7
00130 memcpy_accel_sse_loop:
00131 prefetchnta [esi+16*8]
00132 movaps xmm0, [esi]
00133 movaps xmm1, [esi+16*1]
00134 movaps xmm2, [esi+16*2]
00135 movaps xmm3, [esi+16*3]
00136 movaps xmm4, [esi+16*4]
00137 movaps xmm5, [esi+16*5]
00138 movaps xmm6, [esi+16*6]
00139 movaps xmm7, [esi+16*7]
00140 movntps [edi], xmm0
00141 movntps [edi+16*1], xmm1
00142 movntps [edi+16*2], xmm2
00143 movntps [edi+16*3], xmm3
00144 movntps [edi+16*4], xmm4
00145 movntps [edi+16*5], xmm5
00146 movntps [edi+16*6], xmm6
00147 movntps [edi+16*7], xmm7
00148 add esi, 128
00149 add edi, 128
00150 dec ecx
00151 jnz memcpy_accel_sse_loop
00152 mov ecx, len
00153 and ecx, 127
00154 cmp ecx, 0
00155 je memcpy_accel_sse_end
00156 memcpy_accel_sse_loop2:
00157 mov dl, byte ptr[esi]
00158 mov byte ptr[edi], dl
00159 inc esi
00160 inc edi
00161 dec ecx
00162 jne memcpy_accel_sse_loop2
00163 memcpy_accel_sse_end:
00164 emms
00165 sfence
00166 }
00167 }
00168 else if((g_cpuid.m_flags & CCpuID::mmx) && len >= 64
00169 && !((DWORD)src&7) && !((DWORD)dst&7))
00170 {
00171 __asm
00172 {
00173 mov esi, dword ptr [src]
00174 mov edi, dword ptr [dst]
00175 mov ecx, len
00176 shr ecx, 6
00177 memcpy_accel_mmx_loop:
00178 movq mm0, qword ptr [esi]
00179 movq mm1, qword ptr [esi+8*1]
00180 movq mm2, qword ptr [esi+8*2]
00181 movq mm3, qword ptr [esi+8*3]
00182 movq mm4, qword ptr [esi+8*4]
00183 movq mm5, qword ptr [esi+8*5]
00184 movq mm6, qword ptr [esi+8*6]
00185 movq mm7, qword ptr [esi+8*7]
00186 movq qword ptr [edi], mm0
00187 movq qword ptr [edi+8*1], mm1
00188 movq qword ptr [edi+8*2], mm2
00189 movq qword ptr [edi+8*3], mm3
00190 movq qword ptr [edi+8*4], mm4
00191 movq qword ptr [edi+8*5], mm5
00192 movq qword ptr [edi+8*6], mm6
00193 movq qword ptr [edi+8*7], mm7
00194 add esi, 64
00195 add edi, 64
00196 dec ecx
00197 jnz memcpy_accel_mmx_loop
00198 mov ecx, len
00199 and ecx, 63
00200 cmp ecx, 0
00201 je memcpy_accel_mmx_end
00202 memcpy_accel_mmx_loop2:
00203 mov dl, byte ptr [esi]
00204 mov byte ptr [edi], dl
00205 inc esi
00206 inc edi
00207 dec ecx
00208 jne memcpy_accel_mmx_loop2
00209 memcpy_accel_mmx_end:
00210 emms
00211 }
00212 }
00213 else
00214 {
00215 memcpy(dst, src, len);
00216 }
00217 }
00218
00219 bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00220 {
00221 if(w&1) return(false);
00222
00223 int pitch = min(abs(srcpitch), abs(dstpitch));
00224
00225 for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch)
00226 memcpy_accel(dsty, srcy, pitch);
00227
00228 srcpitch >>= 1;
00229 dstpitch >>= 1;
00230
00231 pitch = min(abs(srcpitch), abs(dstpitch));
00232
00233 for(int y = 0; y < h; y+=2, srcu += srcpitch, dstu += dstpitch)
00234 memcpy_accel(dstu, srcu, pitch);
00235
00236 for(int y = 0; y < h; y+=2, srcv += srcpitch, dstv += dstpitch)
00237 memcpy_accel(dstv, srcv, pitch);
00238
00239 return(true);
00240 }
00241
00242 bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)
00243 {
00244 int pitch = min(abs(srcpitch), abs(dstpitch));
00245
00246 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00247 memcpy_accel(dst, src, pitch);
00248
00249 return(true);
00250 }
00251
// YUV -> RGB row converters with C linkage; they are only declared here and
// are defined outside this listing. BitBltFromI420ToRGB (below) picks one by
// output depth (16/24/32) and CPU features and calls it with two output rows,
// two luma rows, one U row, one V row and a width of w/2.
// NOTE(review): the 16-bit _MMX and _ISSE variants are declared but never
// referenced in the code visible here.
00252 extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00253 extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00254 extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00255 extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00256 extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00257 extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00258 extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00259 extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00260 extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00261
00262 bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00263 {
00264 if(w<=0 || h<=0 || (w&1) || (h&1))
00265 return(false);
00266
00267 void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;;
00268
00269 if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7))
00270 {
00271 switch(dbpp)
00272 {
00273 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
00274 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break;
00275 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break;
00276 }
00277 }
00278 else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
00279 {
00280 switch(dbpp)
00281 {
00282 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
00283 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break;
00284 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break;
00285 }
00286 }
00287 else
00288 {
00289 switch(dbpp)
00290 {
00291 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
00292 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break;
00293 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break;
00294 }
00295 }
00296
00297 if(!asm_YUVtoRGB_row)
00298 return(false);
00299
00300 do
00301 {
00302 asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2);
00303
00304 dst += 2*dstpitch;
00305 srcy += srcpitch*2;
00306 srcu += srcpitch/2;
00307 srcv += srcpitch/2;
00308 }
00309 while(h -= 2);
00310
00311 if(g_cpuid.m_flags & CCpuID::mmx)
00312 __asm emms
00313
00314 if(g_cpuid.m_flags & CCpuID::ssefpu)
00315 __asm sfence
00316
00317 return true;
00318 }
00319
00320 static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
00321 {
00322 WORD* dstw = (WORD*)dst;
00323 for(; width > 1; width -= 2)
00324 {
00325 *dstw++ = (*srcu++<<8)|*srcy++;
00326 *dstw++ = (*srcv++<<8)|*srcy++;
00327 }
00328 }
00329
00330 static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
00331 {
00332 __asm {
00333 push ebp
00334 push edi
00335 push esi
00336 push ebx
00337
00338 mov edi, [esp+20]
00339 mov ebp, [esp+24]
00340 mov ebx, [esp+28]
00341 mov esi, [esp+32]
00342 mov ecx, [esp+36]
00343
00344 shr ecx, 3
00345
00346 yuvtoyuy2row_loop:
00347
00348 movd mm0, [ebx]
00349 punpcklbw mm0, [esi]
00350
00351 movq mm1, [ebp]
00352 movq mm2, mm1
00353 punpcklbw mm1, mm0
00354 punpckhbw mm2, mm0
00355
00356 movq [edi], mm1
00357 movq [edi+8], mm2
00358
00359 add ebp, 8
00360 add ebx, 4
00361 add esi, 4
00362 add edi, 16
00363
00364 dec ecx
00365 jnz yuvtoyuy2row_loop
00366
00367 pop ebx
00368 pop esi
00369 pop edi
00370 pop ebp
00371 ret
00372 };
00373 }
00374
00375 static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
00376 {
00377 WORD* dstw = (WORD*)dst;
00378 for(; width > 1; width -= 2, srcu++, srcv++)
00379 {
00380 *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
00381 *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
00382 }
00383 }
00384
00385 static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
00386 {
00387 static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64;
00388
00389 __asm {
00390 push ebp
00391 push edi
00392 push esi
00393 push ebx
00394
00395 movq mm7, mask
00396
00397 mov edi, [esp+20]
00398 mov ebp, [esp+24]
00399 mov ebx, [esp+28]
00400 mov esi, [esp+32]
00401 mov ecx, [esp+36]
00402 mov eax, [esp+40]
00403
00404 shr ecx, 3
00405
00406 yuvtoyuy2row_avg_loop:
00407
00408 movd mm0, [ebx]
00409 punpcklbw mm0, [esi]
00410 movq mm1, mm0
00411
00412 movd mm2, [ebx + eax]
00413 punpcklbw mm2, [esi + eax]
00414 movq mm3, mm2
00415
00416
00417
00418 pand mm0, mm2
00419 pxor mm1, mm3
00420 psrlq mm1, 1
00421 pand mm1, mm7
00422 paddb mm0, mm1
00423
00424 movq mm1, [ebp]
00425 movq mm2, mm1
00426 punpcklbw mm1, mm0
00427 punpckhbw mm2, mm0
00428
00429 movq [edi], mm1
00430 movq [edi+8], mm2
00431
00432 add ebp, 8
00433 add ebx, 4
00434 add esi, 4
00435 add edi, 16
00436
00437 dec ecx
00438 jnz yuvtoyuy2row_avg_loop
00439
00440 pop ebx
00441 pop esi
00442 pop edi
00443 pop ebp
00444 ret
00445 };
00446 }
00447
00448 bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00449 {
00450 if(w<=0 || h<=0 || (w&1) || (h&1))
00451 return(false);
00452
00453 if(srcpitch == 0) srcpitch = w;
00454
00455 void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL;
00456 void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL;
00457
00458 if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
00459 {
00460 yuvtoyuy2row = yuvtoyuy2row_MMX;
00461 yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX;
00462 }
00463 else
00464 {
00465 yuvtoyuy2row = yuvtoyuy2row_c;
00466 yuvtoyuy2row_avg = yuvtoyuy2row_avg_c;
00467 }
00468
00469 if(!yuvtoyuy2row)
00470 return(false);
00471
00472 do
00473 {
00474 yuvtoyuy2row(dst, srcy, srcu, srcv, w);
00475 yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);
00476
00477 dst += 2*dstpitch;
00478 srcy += srcpitch*2;
00479 srcu += srcpitch/2;
00480 srcv += srcpitch/2;
00481 }
00482 while((h -= 2) > 2);
00483
00484 yuvtoyuy2row(dst, srcy, srcu, srcv, w);
00485 yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);
00486
00487 if(g_cpuid.m_flags & CCpuID::mmx)
00488 __asm emms
00489
00490 return(true);
00491 }
00492
00493 bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp)
00494 {
00495 if(dbpp == sbpp)
00496 {
00497 int bytes = w*dbpp>>3;
00498 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00499 memcpy_accel(dst, src, bytes);
00500 return(true);
00501 }
00502
00503 if(sbpp != 16 && sbpp != 24 && sbpp != 32
00504 || dbpp != 16 && dbpp != 24 && dbpp != 32)
00505 return(false);
00506
00507 if(dbpp == 16)
00508 {
00509 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00510 {
00511 if(sbpp == 24)
00512 {
00513 BYTE* s = (BYTE*)src;
00514 WORD* d = (WORD*)dst;
00515 for(int x = 0; x < w; x++, s+=3, d++)
00516 *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
00517 }
00518 else if(sbpp == 32)
00519 {
00520 DWORD* s = (DWORD*)src;
00521 WORD* d = (WORD*)dst;
00522 for(int x = 0; x < w; x++, s++, d++)
00523 *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));
00524 }
00525 }
00526 }
00527 else if(dbpp == 24)
00528 {
00529 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00530 {
00531 if(sbpp == 16)
00532 {
00533 WORD* s = (WORD*)src;
00534 BYTE* d = (BYTE*)dst;
00535 for(int x = 0; x < w; x++, s++, d+=3)
00536 {
00537 d[0] = (*s&0x001f)<<3;
00538 d[1] = (*s&0x07e0)<<5;
00539 d[2] = (*s&0xf800)<<8;
00540 }
00541 }
00542 else if(sbpp == 32)
00543 {
00544 BYTE* s = (BYTE*)src;
00545 BYTE* d = (BYTE*)dst;
00546 for(int x = 0; x < w; x++, s+=4, d+=3)
00547 {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}
00548 }
00549 }
00550 }
00551 else if(dbpp == 32)
00552 {
00553 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00554 {
00555 if(sbpp == 16)
00556 {
00557 WORD* s = (WORD*)src;
00558 DWORD* d = (DWORD*)dst;
00559 for(int x = 0; x < w; x++, s++, d++)
00560 *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
00561 }
00562 else if(sbpp == 24)
00563 {
00564 BYTE* s = (BYTE*)src;
00565 DWORD* d = (DWORD*)dst;
00566 for(int x = 0; x < w; x++, s+=3, d++)
00567 *d = *((DWORD*)s)&0xffffff;
00568 }
00569 }
00570 }
00571
00572 return(true);
00573 }
00574
00575 static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00576 {
00577 BYTE* src2 = src + srcpitch;
00578 do {*dst++ = (*src++ + *src2++ + 1) >> 1;}
00579 while(w--);
00580 }
00581
00582 static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00583 {
00584 BYTE* src2 = src + srcpitch;
00585 BYTE* src3 = src2 + srcpitch;
00586 do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
00587 while(w--);
00588 }
00589
00590 static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00591 {
00592 static const __int64 _x0001000100010001 = 0x0001000100010001;
00593
00594 __asm {
00595 push ebp
00596 push edi
00597 push esi
00598 push ebx
00599
00600 mov edi,[esp+20]
00601 mov esi,[esp+24]
00602 sub edi,esi
00603 mov ebp,[esp+28]
00604 mov edx,[esp+32]
00605
00606 shr ebp, 3
00607
00608 movq mm6, _x0001000100010001
00609 pxor mm7, mm7
00610
00611 xloop:
00612 movq mm0, [esi]
00613 movq mm3, mm0
00614 punpcklbw mm0, mm7
00615 punpckhbw mm3, mm7
00616
00617 movq mm1, [esi+edx]
00618 movq mm4, mm1
00619 punpcklbw mm1, mm7
00620 punpckhbw mm4, mm7
00621
00622 paddw mm1, mm0
00623 paddw mm1, mm6
00624 psrlw mm1, 1
00625
00626 paddw mm4, mm3
00627 paddw mm4, mm6
00628 psrlw mm4, 1
00629
00630 add esi, 8
00631 packuswb mm1, mm4
00632 movq [edi+esi-8], mm1
00633
00634 dec ebp
00635 jne xloop
00636
00637 pop ebx
00638 pop esi
00639 pop edi
00640 pop ebp
00641 ret
00642 };
00643 }
00644
00645 static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00646 {
00647 static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64;
00648 static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64;
00649 static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64;
00650 static const __int64 _x0002000200020002 = 0x0002000200020002;
00651
00652 __asm {
00653 push ebp
00654 push edi
00655 push esi
00656 push ebx
00657
00658 mov edi, [esp+20]
00659 mov esi, [esp+24]
00660 sub edi, esi
00661 mov ebp, [esp+28]
00662 mov edx, [esp+32]
00663
00664 shr ebp, 3
00665
00666 movq mm6, _x0002000200020002
00667 pxor mm7, mm7
00668
00669 xloop:
00670 movq mm0, [esi]
00671 movq mm3, mm0
00672 punpcklbw mm0, mm7
00673 punpckhbw mm3, mm7
00674
00675 movq mm1, [esi+edx]
00676 movq mm4, mm1
00677 punpcklbw mm1, mm7
00678 punpckhbw mm4, mm7
00679
00680 movq mm2, [esi+edx*2]
00681 movq mm5, mm2
00682 punpcklbw mm2, mm7
00683 punpckhbw mm5, mm7
00684
00685 psllw mm1, 1
00686 paddw mm1, mm0
00687 paddw mm1, mm2
00688 paddw mm1, mm6
00689 psrlw mm1, 2
00690
00691 psllw mm4, 1
00692 paddw mm4, mm3
00693 paddw mm4, mm5
00694 paddw mm4, mm6
00695 psrlw mm4, 2
00696
00697 add esi, 8
00698 packuswb mm1, mm4
00699 movq [edi+esi-8], mm1
00700
00701 dec ebp
00702 jne xloop
00703
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764 pop ebx
00765 pop esi
00766 pop edi
00767 pop ebp
00768 ret
00769 };
00770 }
00771
// Sixteen bytes of value 1, 16-byte aligned so that it can be loaded with
// movdqa. asm_blend_row_SSE2 subtracts it (psubusb) from an intermediate
// average before the final pavgb.
00772 __declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
00773
00774 static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00775 {
00776 __asm
00777 {
00778 mov edx, srcpitch
00779 mov esi, src
00780 mov edi, dst
00781 sub edi, esi
00782 mov ecx, w
00783 mov ebx, ecx
00784 shr ecx, 4
00785 and ebx, 15
00786
00787 movdqa xmm7, [const_1_16_bytes]
00788
00789 asm_blend_row_SSE2_loop:
00790 movdqa xmm0, [esi]
00791 movdqa xmm1, [esi+edx]
00792 movdqa xmm2, [esi+edx*2]
00793 pavgb xmm0, xmm1
00794 pavgb xmm2, xmm1
00795 psubusb xmm0, xmm7
00796 pavgb xmm0, xmm2
00797 movdqa [esi+edi], xmm0
00798 add esi, 16
00799 dec ecx
00800 jnz asm_blend_row_SSE2_loop
00801
00802 test ebx,15
00803 jz asm_blend_row_SSE2_end
00804
00805 mov ecx, ebx
00806 xor ax, ax
00807 xor bx, bx
00808 xor dx, dx
00809 asm_blend_row_SSE2_loop2:
00810 mov al, [esi]
00811 mov bl, [esi+edx]
00812 mov dl, [esi+edx*2]
00813 add ax, bx
00814 inc ax
00815 shr ax, 1
00816 add dx, bx
00817 inc dx
00818 shr dx, 1
00819 add ax, dx
00820 shr ax, 1
00821 mov [esi+edi], al
00822 inc esi
00823 dec ecx
00824 jnz asm_blend_row_SSE2_loop2
00825
00826 asm_blend_row_SSE2_end:
00827 }
00828 }
00829
00830 static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00831 {
00832 __asm
00833 {
00834 mov edx, srcpitch
00835 mov esi, src
00836 mov edi, dst
00837 sub edi, esi
00838 mov ecx, w
00839 mov ebx, ecx
00840 shr ecx, 4
00841 and ebx, 15
00842
00843 movdqa xmm7, [const_1_16_bytes]
00844
00845 asm_blend_row_clipped_SSE2_loop:
00846 movdqa xmm0, [esi]
00847 movdqa xmm1, [esi+edx]
00848 pavgb xmm0, xmm1
00849 movdqa [esi+edi], xmm0
00850 add esi, 16
00851 dec ecx
00852 jnz asm_blend_row_clipped_SSE2_loop
00853
00854 test ebx,15
00855 jz asm_blend_row_clipped_SSE2_end
00856
00857 mov ecx, ebx
00858 xor ax, ax
00859 xor bx, bx
00860 asm_blend_row_clipped_SSE2_loop2:
00861 mov al, [esi]
00862 mov bl, [esi+edx]
00863 add ax, bx
00864 inc ax
00865 shr ax, 1
00866 mov [esi+edi], al
00867 inc esi
00868 dec ecx
00869 jnz asm_blend_row_clipped_SSE2_loop2
00870
00871 asm_blend_row_clipped_SSE2_end:
00872 }
00873 }
00874
00875 void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)
00876 {
00877 void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
00878 void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
00879
00880 if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf))
00881 {
00882 asm_blend_row_clipped = asm_blend_row_clipped_SSE2;
00883 asm_blend_row = asm_blend_row_SSE2;
00884 }
00885 else if(g_cpuid.m_flags & CCpuID::mmx)
00886 {
00887 asm_blend_row_clipped = asm_blend_row_clipped_MMX;
00888 asm_blend_row = asm_blend_row_MMX;
00889 }
00890 else
00891 {
00892 asm_blend_row_clipped = asm_blend_row_clipped_c;
00893 asm_blend_row = asm_blend_row_c;
00894 }
00895
00896 if(!asm_blend_row_clipped)
00897 return;
00898
00899 asm_blend_row_clipped(dst, src, rowbytes, srcpitch);
00900
00901 if((h -= 2) > 0) do
00902 {
00903 dst += dstpitch;
00904 asm_blend_row(dst, src, rowbytes, srcpitch);
00905 src += srcpitch;
00906 }
00907 while(--h);
00908
00909 asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch);
00910
00911 if(g_cpuid.m_flags & CCpuID::mmx)
00912 __asm emms
00913 }
00914
00915 void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
00916 {
00917 if(h <= 1) return;
00918
00919 BYTE* s = dst;
00920 BYTE* d = dst + (h-2)*pitch;
00921
00922 for(; s < d; s += pitch*2)
00923 {
00924 BYTE* tmp = s;
00925
00926 if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))
00927 {
00928 __asm
00929 {
00930 mov esi, tmp
00931 mov ebx, pitch
00932
00933 mov ecx, ebx
00934 shr ecx, 4
00935
00936 AvgLines8_sse2_loop:
00937 movdqa xmm0, [esi]
00938 pavgb xmm0, [esi+ebx*2]
00939 movdqa [esi+ebx], xmm0
00940 add esi, 16
00941
00942 dec ecx
00943 jnz AvgLines8_sse2_loop
00944
00945 mov tmp, esi
00946 }
00947
00948 for(int i = pitch&7; i--; tmp++)
00949 {
00950 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
00951 }
00952 }
00953 else if(g_cpuid.m_flags & CCpuID::mmx)
00954 {
00955 __asm
00956 {
00957 mov esi, tmp
00958 mov ebx, pitch
00959
00960 mov ecx, ebx
00961 shr ecx, 3
00962
00963 pxor mm7, mm7
00964 AvgLines8_mmx_loop:
00965 movq mm0, [esi]
00966 movq mm1, mm0
00967
00968 punpcklbw mm0, mm7
00969 punpckhbw mm1, mm7
00970
00971 movq mm2, [esi+ebx*2]
00972 movq mm3, mm2
00973
00974 punpcklbw mm2, mm7
00975 punpckhbw mm3, mm7
00976
00977 paddw mm0, mm2
00978 psrlw mm0, 1
00979
00980 paddw mm1, mm3
00981 psrlw mm1, 1
00982
00983 packuswb mm0, mm1
00984
00985 movq [esi+ebx], mm0
00986
00987 lea esi, [esi+8]
00988
00989 dec ecx
00990 jnz AvgLines8_mmx_loop
00991
00992 mov tmp, esi
00993 }
00994
00995 for(int i = pitch&7; i--; tmp++)
00996 {
00997 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
00998 }
00999 }
01000 else
01001 {
01002 for(int i = pitch; i--; tmp++)
01003 {
01004 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
01005 }
01006 }
01007 }
01008
01009 if(!(h&1) && h >= 2)
01010 {
01011 dst += (h-2)*pitch;
01012 memcpy_accel(dst + pitch, dst, pitch);
01013 }
01014
01015 __asm emms;
01016 }
01017
01018 void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)
01019 {
01020 if(h <= 1) return;
01021
01022 unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00;
01023 unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
01024 unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
01025
01026 BYTE* s = dst;
01027 BYTE* d = dst + (h-2)*pitch;
01028
01029 for(; s < d; s += pitch*2)
01030 {
01031 BYTE* tmp = s;
01032
01033 __asm
01034 {
01035 mov esi, tmp
01036 mov ebx, pitch
01037
01038 mov ecx, ebx
01039 shr ecx, 3
01040
01041 movq mm6, __0x03e003e003e003e0
01042 movq mm7, __0x001f001f001f001f
01043
01044 AvgLines555_loop:
01045 movq mm0, [esi]
01046 movq mm1, mm0
01047 movq mm2, mm0
01048
01049 psrlw mm0, 10
01050 pand mm1, mm6
01051 pand mm2, mm7
01052
01053 movq mm3, [esi+ebx*2]
01054 movq mm4, mm3
01055 movq mm5, mm3
01056
01057 psrlw mm3, 10
01058 pand mm4, mm6
01059 pand mm5, mm7
01060
01061 paddw mm0, mm3
01062 psrlw mm0, 1
01063 psllw mm0, 10
01064
01065 paddw mm1, mm4
01066 psrlw mm1, 1
01067 pand mm1, mm6
01068
01069 paddw mm2, mm5
01070 psrlw mm2, 1
01071
01072
01073 por mm0, mm1
01074 por mm0, mm2
01075
01076 movq [esi+ebx], mm0
01077
01078 lea esi, [esi+8]
01079
01080 dec ecx
01081 jnz AvgLines555_loop
01082
01083 mov tmp, esi
01084 }
01085
01086 for(int i = (pitch&7)>>1; i--; tmp++)
01087 {
01088 tmp[pitch] =
01089 ((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)|
01090 ((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)|
01091 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
01092 }
01093 }
01094
01095 if(!(h&1) && h >= 2)
01096 {
01097 dst += (h-2)*pitch;
01098 memcpy_accel(dst + pitch, dst, pitch);
01099 }
01100
01101 __asm emms;
01102 }
01103
01104 void AvgLines565(BYTE* dst, DWORD h, DWORD pitch)
01105 {
01106 if(h <= 1) return;
01107
01108 unsigned __int64 __0xf800f800f800f800 = 0xf800f800f800f800;
01109 unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0;
01110 unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
01111
01112 BYTE* s = dst;
01113 BYTE* d = dst + (h-2)*pitch;
01114
01115 for(; s < d; s += pitch*2)
01116 {
01117 WORD* tmp = (WORD*)s;
01118
01119 __asm
01120 {
01121 mov esi, tmp
01122 mov ebx, pitch
01123
01124 mov ecx, ebx
01125 shr ecx, 3
01126
01127 movq mm6, __0x07e007e007e007e0
01128 movq mm7, __0x001f001f001f001f
01129
01130 AvgLines565_loop:
01131 movq mm0, [esi]
01132 movq mm1, mm0
01133 movq mm2, mm0
01134
01135 psrlw mm0, 11
01136 pand mm1, mm6
01137 pand mm2, mm7
01138
01139 movq mm3, [esi+ebx*2]
01140 movq mm4, mm3
01141 movq mm5, mm3
01142
01143 psrlw mm3, 11
01144 pand mm4, mm6
01145 pand mm5, mm7
01146
01147 paddw mm0, mm3
01148 psrlw mm0, 1
01149 psllw mm0, 11
01150
01151 paddw mm1, mm4
01152 psrlw mm1, 1
01153 pand mm1, mm6
01154
01155 paddw mm2, mm5
01156 psrlw mm2, 1
01157
01158
01159 por mm0, mm1
01160 por mm0, mm2
01161
01162 movq [esi+ebx], mm0
01163
01164 lea esi, [esi+8]
01165
01166 dec ecx
01167 jnz AvgLines565_loop
01168
01169 mov tmp, esi
01170 }
01171
01172 for(int i = (pitch&7)>>1; i--; tmp++)
01173 {
01174 tmp[pitch] =
01175 ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)|
01176 ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)|
01177 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
01178 }
01179 }
01180
01181 if(!(h&1) && h >= 2)
01182 {
01183 dst += (h-2)*pitch;
01184 memcpy_accel(dst + pitch, dst, pitch);
01185 }
01186
01187 __asm emms;
01188 }
01189
// YUY2 -> RGB converters with C linkage; declared here, defined outside this
// listing. BitBltFromYUY2ToRGB (below) is their only caller in this file.
01190 extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
01191 extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
01192
01193 bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch)
01194 {
01195 void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL;
01196
01197 if(g_cpuid.m_flags & CCpuID::mmx)
01198 {
01199 YUY2toRGB =
01200 dbpp == 32 ? mmx_YUY2toRGB32 :
01201 dbpp == 24 ? mmx_YUY2toRGB24 :
01202
01203 NULL;
01204 }
01205 else
01206 {
01207
01208 }
01209
01210 if(!YUY2toRGB) return(false);
01211
01212 YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false);
01213
01214 return(true);
01215 }