vd.cpp

00001 //      VirtualDub - Video processing and capture application
00002 //      Copyright (C) 1998-2001 Avery Lee
00003 //
00004 //      This program is free software; you can redistribute it and/or modify
00005 //      it under the terms of the GNU General Public License as published by
00006 //      the Free Software Foundation; either version 2 of the License, or
00007 //      (at your option) any later version.
00008 //
00009 //      This program is distributed in the hope that it will be useful,
00010 //      but WITHOUT ANY WARRANTY; without even the implied warranty of
00011 //      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012 //      GNU General Public License for more details.
00013 //
00014 //      You should have received a copy of the GNU General Public License
00015 //      along with this program; if not, write to the Free Software
00016 //      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00017 //
00018 //  Notes: 
00019 //  - BitBltFromI420ToRGB is from VirtualDub
00020 //      - The core assembly function of CCpuID is from DVD2AVI
00021 //      (- vd.cpp/h should be renamed to something more sensible already :)
00022 
00023 
00024 #include "stdafx.h"
00025 #include "vd.h"
00026 
// C4799 is MSVC's "function has no EMMS instruction" warning; it is disabled for this file.
00027 #pragma warning(disable : 4799) // no emms... blahblahblah
00028 
// ReadTSC(x): executes cpuid and then rdtsc, storing eax into the dword at x
// and edx into the dword at x+4, so x is presumably a 64-bit variable
// (TODO confirm -- no use of the macro is visible in this part of the file).
// Both instructions write eax and edx; cpuid additionally overwrites ebx and ecx.
00029 #define ReadTSC( x ) __asm cpuid \
00030         __asm rdtsc \
00031         __asm mov dword ptr x,eax \
00032         __asm mov dword ptr x+4,edx
00033 
00034 CCpuID::CCpuID()
00035 {
00036         DWORD flags = 0;
00037 
00038         __asm
00039         {
00040                 mov                     eax, 1
00041                 cpuid
00042                 test            edx, 0x00800000         // STD MMX
00043                 jz                      TEST_SSE
00044                 or                      [flags], 1
00045 TEST_SSE:
00046                 test            edx, 0x02000000         // STD SSE
00047                 jz                      TEST_SSE2
00048                 or                      [flags], 2
00049                 or                      [flags], 4
00050 TEST_SSE2:
00051                 test            edx, 0x04000000         // SSE2 
00052                 jz                      TEST_3DNOW
00053                 or                      [flags], 8
00054 TEST_3DNOW:
00055                 mov                     eax, 0x80000001
00056                 cpuid
00057                 test            edx, 0x80000000         // 3D NOW
00058                 jz                      TEST_SSEMMX
00059                 or                      [flags], 16
00060 TEST_SSEMMX:
00061                 test            edx, 0x00400000         // SSE MMX
00062                 jz                      TEST_END
00063                 or                      [flags], 2
00064 TEST_END:
00065         }
00066 
00067         m_flags = (flag_t)flags;
00068 }
00069 
// Global CPU feature descriptor. Being a namespace-scope object, its constructor
// (the cpuid probing above) runs during static initialization; the blitters in
// this file read g_cpuid.m_flags to choose a code path.
00070 CCpuID g_cpuid;
00071 
// Copies len bytes from src to dst, using the widest path that the detected CPU
// features, the length and the pointer alignment allow:
//  - sse2 flag, len >= 128, both pointers 16-byte aligned:
//    128-byte blocks via movdqa loads and movntps stores;
//  - ssefpu flag, same length/alignment conditions:
//    the same loop with movaps loads;
//  - mmx flag, len >= 64, both pointers 8-byte aligned:
//    64-byte blocks via movq loads/stores;
//  - otherwise plain memcpy().
// In each asm path the bytes left over after the block loop are copied one by one.
// NOTE(review): the alignment tests cast the pointers to DWORD and the asm uses
// 32-bit registers, so this presumably targets 32-bit x86 builds only.
00072 void memcpy_accel(void* dst, const void* src, size_t len)
00073 {
00074         if((g_cpuid.m_flags & CCpuID::sse2) && len >= 128 
00075                 && !((DWORD)src&15) && !((DWORD)dst&15))
00076         {
00077                 __asm
00078                 {
00079                         mov     esi, dword ptr [src]
00080                         mov     edi, dword ptr [dst]
00081                         mov     ecx, len
00082                         shr     ecx, 7 // ecx = number of whole 128-byte blocks (>= 1 because len >= 128)
00083         memcpy_accel_sse2_loop:
00084                         prefetchnta     [esi+16*8] // prefetch hint for the block after the current one
00085                         movdqa          xmm0, [esi]
00086                         movdqa          xmm1, [esi+16*1]
00087                         movdqa          xmm2, [esi+16*2]
00088                         movdqa          xmm3, [esi+16*3]
00089                         movdqa          xmm4, [esi+16*4]
00090                         movdqa          xmm5, [esi+16*5]
00091                         movdqa          xmm6, [esi+16*6]
00092                         movdqa          xmm7, [esi+16*7]
00093                         movntps         [edi], xmm0 // non-temporal stores; sfence is issued at the end
00094                         movntps         [edi+16*1], xmm1
00095                         movntps         [edi+16*2], xmm2
00096                         movntps         [edi+16*3], xmm3
00097                         movntps         [edi+16*4], xmm4
00098                         movntps         [edi+16*5], xmm5
00099                         movntps         [edi+16*6], xmm6
00100                         movntps         [edi+16*7], xmm7
00101                         add                     esi, 128
00102                         add                     edi, 128
00103                         dec                     ecx
00104                         jnz                     memcpy_accel_sse2_loop
00105                         mov     ecx, len
00106                         and     ecx, 127 // ecx = bytes left after the 128-byte blocks
00107                         cmp     ecx, 0
00108                         je              memcpy_accel_sse2_end
00109         memcpy_accel_sse2_loop2: // byte-by-byte tail copy
00110                         mov             dl, byte ptr[esi] 
00111                         mov             byte ptr[edi], dl
00112                         inc             esi
00113                         inc             edi
00114                         dec             ecx
00115                         jne             memcpy_accel_sse2_loop2
00116         memcpy_accel_sse2_end:
00117                         emms // NOTE(review): only xmm registers are used on this path, so emms looks unnecessary
00118                         sfence
00119                 }
00120         }
00121         else if((g_cpuid.m_flags & CCpuID::ssefpu) && len >= 128 
00122                 && !((DWORD)src&15) && !((DWORD)dst&15))
00123         {
00124                 __asm
00125                 {
00126                         mov     esi, dword ptr [src]
00127                         mov     edi, dword ptr [dst]
00128                         mov     ecx, len
00129                         shr     ecx, 7 // ecx = number of whole 128-byte blocks (>= 1 because len >= 128)
00130         memcpy_accel_sse_loop:
00131                         prefetchnta     [esi+16*8] // prefetch hint for the block after the current one
00132                         movaps          xmm0, [esi]
00133                         movaps          xmm1, [esi+16*1]
00134                         movaps          xmm2, [esi+16*2]
00135                         movaps          xmm3, [esi+16*3]
00136                         movaps          xmm4, [esi+16*4]
00137                         movaps          xmm5, [esi+16*5]
00138                         movaps          xmm6, [esi+16*6]
00139                         movaps          xmm7, [esi+16*7]
00140                         movntps         [edi], xmm0 // non-temporal stores; sfence is issued at the end
00141                         movntps         [edi+16*1], xmm1
00142                         movntps         [edi+16*2], xmm2
00143                         movntps         [edi+16*3], xmm3
00144                         movntps         [edi+16*4], xmm4
00145                         movntps         [edi+16*5], xmm5
00146                         movntps         [edi+16*6], xmm6
00147                         movntps         [edi+16*7], xmm7
00148                         add                     esi, 128
00149                         add                     edi, 128
00150                         dec                     ecx
00151                         jnz                     memcpy_accel_sse_loop
00152                         mov     ecx, len
00153                         and     ecx, 127 // ecx = bytes left after the 128-byte blocks
00154                         cmp     ecx, 0
00155                         je              memcpy_accel_sse_end
00156         memcpy_accel_sse_loop2: // byte-by-byte tail copy
00157                         mov             dl, byte ptr[esi] 
00158                         mov             byte ptr[edi], dl
00159                         inc             esi
00160                         inc             edi
00161                         dec             ecx
00162                         jne             memcpy_accel_sse_loop2
00163         memcpy_accel_sse_end:
00164                         emms // NOTE(review): only xmm registers are used on this path, so emms looks unnecessary
00165                         sfence
00166                 }
00167         }
00168         else if((g_cpuid.m_flags & CCpuID::mmx) && len >= 64
00169                 && !((DWORD)src&7) && !((DWORD)dst&7))
00170         {
00171                 __asm 
00172                 {
00173                         mov     esi, dword ptr [src]
00174                         mov     edi, dword ptr [dst]
00175                         mov     ecx, len
00176                         shr     ecx, 6 // ecx = number of whole 64-byte blocks (>= 1 because len >= 64)
00177         memcpy_accel_mmx_loop:
00178                         movq    mm0, qword ptr [esi]
00179                         movq    mm1, qword ptr [esi+8*1]
00180                         movq    mm2, qword ptr [esi+8*2]
00181                         movq    mm3, qword ptr [esi+8*3]
00182                         movq    mm4, qword ptr [esi+8*4]
00183                         movq    mm5, qword ptr [esi+8*5]
00184                         movq    mm6, qword ptr [esi+8*6]
00185                         movq    mm7, qword ptr [esi+8*7]
00186                         movq    qword ptr [edi], mm0
00187                         movq    qword ptr [edi+8*1], mm1
00188                         movq    qword ptr [edi+8*2], mm2
00189                         movq    qword ptr [edi+8*3], mm3
00190                         movq    qword ptr [edi+8*4], mm4
00191                         movq    qword ptr [edi+8*5], mm5
00192                         movq    qword ptr [edi+8*6], mm6
00193                         movq    qword ptr [edi+8*7], mm7
00194                         add     esi, 64
00195                         add     edi, 64
00196                         dec             ecx
00197                         jnz             memcpy_accel_mmx_loop
00198                         mov     ecx, len
00199                         and     ecx, 63 // ecx = bytes left after the 64-byte blocks
00200                         cmp     ecx, 0
00201                         je              memcpy_accel_mmx_end
00202         memcpy_accel_mmx_loop2: // byte-by-byte tail copy
00203                         mov             dl, byte ptr [esi] 
00204                         mov             byte ptr [edi], dl
00205                         inc             esi
00206                         inc             edi
00207                         dec             ecx
00208                         jne             memcpy_accel_mmx_loop2
00209         memcpy_accel_mmx_end:
00210                         emms // leave MMX state so x87 code can run again
00211                 }
00212         }
00213         else
00214         {
00215                 memcpy(dst, src, len); // no usable SIMD path (flags, length or alignment)
00216         }
00217 }
00218 
00219 bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00220 {
00221         if(w&1) return(false);
00222 
00223         int pitch = min(abs(srcpitch), abs(dstpitch));
00224 
00225         for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch)
00226                 memcpy_accel(dsty, srcy, pitch);
00227 
00228         srcpitch >>= 1;
00229         dstpitch >>= 1;
00230 
00231         pitch = min(abs(srcpitch), abs(dstpitch));
00232 
00233         for(int y = 0; y < h; y+=2, srcu += srcpitch, dstu += dstpitch)
00234                 memcpy_accel(dstu, srcu, pitch);
00235 
00236         for(int y = 0; y < h; y+=2, srcv += srcpitch, dstv += dstpitch)
00237                 memcpy_accel(dstv, srcv, pitch);
00238 
00239         return(true);
00240 }
00241 
00242 bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)
00243 {
00244         int pitch = min(abs(srcpitch), abs(dstpitch));
00245 
00246         for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00247                 memcpy_accel(dst, src, pitch);
00248 
00249         return(true);
00250 }
00251 
// Row converters defined outside this file (C linkage), in plain, _MMX and _ISSE
// variants for 16, 24 and 32 bits per output pixel. Each call takes two output
// row pointers, two luma row pointers, one U and one V row pointer and a width;
// BitBltFromI420ToRGB passes w/2 as the width, so it is presumably counted in
// pixel pairs (TODO confirm against the assembly sources).
00252 extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00253 extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00254 extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00255 extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00256 extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00257 extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00258 extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00259 extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00260 extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
00261 
00262 bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00263 {
00264         if(w<=0 || h<=0 || (w&1) || (h&1))
00265                 return(false);
00266 
00267         void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;;
00268 
00269         if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7))
00270         {
00271                 switch(dbpp)
00272                 {
00273                 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565)
00274                 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break;
00275                 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break;
00276                 }
00277         }
00278         else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
00279         {
00280                 switch(dbpp)
00281                 {
00282                 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565)
00283                 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break;
00284                 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break;
00285                 }
00286         }
00287         else
00288         {
00289                 switch(dbpp)
00290                 {
00291                 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
00292                 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break;
00293                 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break;
00294                 }
00295         }
00296 
00297         if(!asm_YUVtoRGB_row) 
00298                 return(false);
00299 
00300         do
00301         {
00302                 asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2);
00303 
00304                 dst += 2*dstpitch;
00305                 srcy += srcpitch*2;
00306                 srcu += srcpitch/2;
00307                 srcv += srcpitch/2;
00308         }
00309         while(h -= 2);
00310 
00311         if(g_cpuid.m_flags & CCpuID::mmx)
00312                 __asm emms
00313 
00314         if(g_cpuid.m_flags & CCpuID::ssefpu)
00315                 __asm sfence
00316 
00317         return true;
00318 }
00319 
00320 static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
00321 {
00322         WORD* dstw = (WORD*)dst;
00323         for(; width > 1; width -= 2)
00324         {
00325                 *dstw++ = (*srcu++<<8)|*srcy++;
00326                 *dstw++ = (*srcv++<<8)|*srcy++;
00327         }
00328 }
00329 
// MMX version of the Y/U/V -> YUY2 row packer. Each loop iteration reads
// 8 luma bytes, 4 U bytes and 4 V bytes and writes 16 output bytes in the
// order Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
// The function is naked: arguments are read straight from the stack, at
// esp+20 and up after the four register pushes (16 bytes + return address).
// The loop count is width >> 3 and is decremented before it is tested, so
// width must be at least 8; the visible caller (BitBltFromI420ToYUY2) only
// selects this function when w > 0 and w is a multiple of 8.
// No emms is executed here; BitBltFromI420ToYUY2 issues it after its row loop.
00330 static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
00331 {
00332         __asm {
00333                 push    ebp
00334                 push    edi
00335                 push    esi
00336                 push    ebx
00337 
00338                 mov             edi, [esp+20] // dst
00339                 mov             ebp, [esp+24] // srcy
00340                 mov             ebx, [esp+28] // srcu
00341                 mov             esi, [esp+32] // srcv
00342                 mov             ecx, [esp+36] // width
00343 
00344                 shr             ecx, 3 // ecx = number of 8-sample groups
00345 
00346 yuvtoyuy2row_loop:
00347 
00348                 movd            mm0, [ebx] // 4 U bytes
00349                 punpcklbw       mm0, [esi] // interleave with 4 V bytes: U0 V0 U1 V1 U2 V2 U3 V3
00350 
00351                 movq            mm1, [ebp] // 8 luma bytes
00352                 movq            mm2, mm1
00353                 punpcklbw       mm1, mm0 // Y0 U0 Y1 V0 Y2 U1 Y3 V1
00354                 punpckhbw       mm2, mm0 // Y4 U2 Y5 V2 Y6 U3 Y7 V3
00355 
00356                 movq            [edi], mm1
00357                 movq            [edi+8], mm2
00358 
00359                 add             ebp, 8
00360                 add             ebx, 4
00361                 add             esi, 4
00362         add             edi, 16
00363 
00364                 dec             ecx
00365                 jnz             yuvtoyuy2row_loop
00366 
00367                 pop             ebx
00368                 pop             esi
00369                 pop             edi
00370                 pop             ebp
00371                 ret
00372         };
00373 }
00374 
00375 static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
00376 {
00377         WORD* dstw = (WORD*)dst;
00378         for(; width > 1; width -= 2, srcu++, srcv++)
00379         {
00380                 *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
00381                 *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
00382         }
00383 }
00384 
// MMX version of the averaging row packer: like yuvtoyuy2row_MMX, but the
// chroma bytes are the truncated per-byte average of the current chroma row
// and the row pitchuv bytes further on.
// The function is naked: arguments are read from the stack at esp+20 and up
// after the four register pushes.
// The loop count is width >> 3 and is decremented before it is tested, so
// width must be at least 8; the visible caller (BitBltFromI420ToYUY2) only
// selects this function when w > 0 and w is a multiple of 8.
// No emms is executed here; BitBltFromI420ToYUY2 issues it after its row loop.
00385 static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
00386 {
00387         static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64;
00388 
00389         __asm {
00390                 push    ebp
00391                 push    edi
00392                 push    esi
00393                 push    ebx
00394 
00395                 movq    mm7, mask // clears the bit shifted in from the neighbouring byte after psrlq
00396 
00397                 mov             edi, [esp+20] // dst
00398                 mov             ebp, [esp+24] // srcy
00399                 mov             ebx, [esp+28] // srcu
00400                 mov             esi, [esp+32] // srcv
00401                 mov             ecx, [esp+36] // width
00402                 mov             eax, [esp+40] // pitchuv
00403 
00404                 shr             ecx, 3 // ecx = number of 8-sample groups
00405 
00406 yuvtoyuy2row_avg_loop:
00407 
00408                 movd            mm0, [ebx] // 4 U bytes of the current chroma row
00409                 punpcklbw       mm0, [esi] // interleave with V: U0 V0 U1 V1 U2 V2 U3 V3
00410                 movq            mm1, mm0
00411 
00412                 movd            mm2, [ebx + eax] // same for the chroma row pitchuv bytes further on
00413                 punpcklbw       mm2, [esi + eax]
00414                 movq            mm3, mm2
00415 
00416                 // (x+y)>>1 == (x&y)+((x^y)>>1)
00417 
00418                 pand            mm0, mm2
00419                 pxor            mm1, mm3
00420                 psrlq           mm1, 1 // 64-bit shift; bits crossing byte borders are masked off next
00421                 pand            mm1, mm7
00422                 paddb           mm0, mm1 // mm0 = per-byte average of the two chroma rows
00423 
00424                 movq            mm1, [ebp] // 8 luma bytes
00425                 movq            mm2, mm1
00426                 punpcklbw       mm1, mm0 // Y0 U0 Y1 V0 Y2 U1 Y3 V1
00427                 punpckhbw       mm2, mm0 // Y4 U2 Y5 V2 Y6 U3 Y7 V3
00428 
00429                 movq            [edi], mm1
00430                 movq            [edi+8], mm2
00431 
00432                 add             ebp, 8
00433                 add             ebx, 4
00434                 add             esi, 4
00435         add             edi, 16
00436 
00437                 dec             ecx
00438                 jnz             yuvtoyuy2row_avg_loop
00439 
00440                 pop             ebx
00441                 pop             esi
00442                 pop             edi
00443                 pop             ebp
00444                 ret
00445         };
00446 }
00447 
00448 bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
00449 {
00450         if(w<=0 || h<=0 || (w&1) || (h&1))
00451                 return(false);
00452 
00453         if(srcpitch == 0) srcpitch = w;
00454 
00455         void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL;
00456         void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL;
00457 
00458         if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
00459         {
00460                 yuvtoyuy2row = yuvtoyuy2row_MMX;
00461                 yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX;
00462         }
00463         else
00464         {
00465                 yuvtoyuy2row = yuvtoyuy2row_c;
00466                 yuvtoyuy2row_avg = yuvtoyuy2row_avg_c;
00467         }
00468 
00469         if(!yuvtoyuy2row) 
00470                 return(false);
00471 
00472         do
00473         {
00474                 yuvtoyuy2row(dst, srcy, srcu, srcv, w);
00475                 yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);
00476 
00477                 dst += 2*dstpitch;
00478                 srcy += srcpitch*2;
00479                 srcu += srcpitch/2;
00480                 srcv += srcpitch/2;
00481         }
00482         while((h -= 2) > 2);
00483 
00484         yuvtoyuy2row(dst, srcy, srcu, srcv, w);
00485         yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);
00486 
00487         if(g_cpuid.m_flags & CCpuID::mmx)
00488                 __asm emms
00489 
00490         return(true);
00491 }
00492 
00493 bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp)
00494 {
00495         if(dbpp == sbpp)
00496         {
00497                 int bytes = w*dbpp>>3;
00498                 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00499                         memcpy_accel(dst, src, bytes);
00500                 return(true);
00501         }
00502         
00503         if(sbpp != 16 && sbpp != 24 && sbpp != 32
00504         || dbpp != 16 && dbpp != 24 && dbpp != 32)
00505                 return(false);
00506 
00507         if(dbpp == 16)
00508         {
00509                 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00510                 {
00511                         if(sbpp == 24)
00512                         {
00513                                 BYTE* s = (BYTE*)src;
00514                                 WORD* d = (WORD*)dst;
00515                                 for(int x = 0; x < w; x++, s+=3, d++)
00516                                         *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
00517                         }
00518                         else if(sbpp == 32)
00519                         {
00520                                 DWORD* s = (DWORD*)src;
00521                                 WORD* d = (WORD*)dst;
00522                                 for(int x = 0; x < w; x++, s++, d++)
00523                                         *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));
00524                         }
00525                 }
00526         }
00527         else if(dbpp == 24)
00528         {
00529                 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00530                 {
00531                         if(sbpp == 16)
00532                         {
00533                                 WORD* s = (WORD*)src;
00534                                 BYTE* d = (BYTE*)dst;
00535                                 for(int x = 0; x < w; x++, s++, d+=3)
00536                                 {       // not tested, r-g-b might be in reverse
00537                                         d[0] = (*s&0x001f)<<3;
00538                                         d[1] = (*s&0x07e0)<<5;
00539                                         d[2] = (*s&0xf800)<<8;
00540                                 }
00541                         }
00542                         else if(sbpp == 32)
00543                         {
00544                                 BYTE* s = (BYTE*)src;
00545                                 BYTE* d = (BYTE*)dst;
00546                                 for(int x = 0; x < w; x++, s+=4, d+=3)
00547                                         {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}
00548                         }
00549                 }
00550         }
00551         else if(dbpp == 32)
00552         {
00553                 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
00554                 {
00555                         if(sbpp == 16)
00556                         {
00557                                 WORD* s = (WORD*)src;
00558                                 DWORD* d = (DWORD*)dst;
00559                                 for(int x = 0; x < w; x++, s++, d++)
00560                                         *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
00561                         }
00562                         else if(sbpp == 24)
00563                         {       
00564                                 BYTE* s = (BYTE*)src;
00565                                 DWORD* d = (DWORD*)dst;
00566                                 for(int x = 0; x < w; x++, s+=3, d++)
00567                                         *d = *((DWORD*)s)&0xffffff;
00568                         }
00569                 }
00570         }
00571 
00572         return(true);
00573 }
00574 
00575 static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00576 {
00577         BYTE* src2 = src + srcpitch;
00578         do {*dst++ = (*src++ + *src2++ + 1) >> 1;}
00579         while(w--);
00580 }
00581 
00582 static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00583 {
00584         BYTE* src2 = src + srcpitch;
00585         BYTE* src3 = src2 + srcpitch;
00586         do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
00587         while(w--);
00588 }
00589 
// MMX version of the two-row blend: every output byte is (a + b + 1) >> 1,
// where a is the source byte and b the byte srcpitch bytes below it.
// Bytes are widened to 16-bit words for the arithmetic and packed back with
// unsigned saturation.
// The function is naked: arguments are read from the stack at esp+20 and up
// after the four register pushes.
// The loop handles w >> 3 groups of 8 bytes and decrements the count before
// testing it, so w is presumably expected to be a non-zero multiple of 8
// (TODO confirm against the caller, which is not visible here).
// No emms is executed in this function.
00590 static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00591 {
00592         static const __int64 _x0001000100010001 = 0x0001000100010001;
00593 
00594         __asm {
00595                 push    ebp
00596                 push    edi
00597                 push    esi
00598                 push    ebx
00599 
00600                 mov             edi,[esp+20] // dst
00601                 mov             esi,[esp+24] // src
00602                 sub             edi,esi // edi = dst - src, so [edi+esi] addresses the destination
00603                 mov             ebp,[esp+28] // w
00604                 mov             edx,[esp+32] // srcpitch
00605 
00606                 shr             ebp, 3 // ebp = number of 8-byte groups
00607 
00608                 movq    mm6, _x0001000100010001 // rounding term, 1 per word
00609                 pxor    mm7, mm7 // zero, used to widen bytes to words
00610 
00611 xloop:
00612                 movq            mm0, [esi] // 8 bytes of the current row
00613                 movq            mm3, mm0
00614                 punpcklbw       mm0, mm7
00615                 punpckhbw       mm3, mm7
00616 
00617                 movq            mm1, [esi+edx] // 8 bytes of the row below
00618                 movq            mm4, mm1
00619                 punpcklbw       mm1, mm7
00620                 punpckhbw       mm4, mm7
00621 
00622                 paddw           mm1, mm0
00623                 paddw           mm1, mm6
00624                 psrlw           mm1, 1 // low four results: (a + b + 1) >> 1
00625 
00626                 paddw           mm4, mm3
00627                 paddw           mm4, mm6
00628                 psrlw           mm4, 1 // high four results
00629 
00630                 add                     esi, 8
00631                 packuswb        mm1, mm4
00632                 movq            [edi+esi-8], mm1 // -8 compensates for esi having advanced already
00633 
00634                 dec             ebp
00635                 jne             xloop
00636 
00637                 pop             ebx
00638                 pop             esi
00639                 pop             edi
00640                 pop             ebp
00641                 ret
00642         };
00643 }
00644 
// MMX version of the three-row blend: every output byte is
// (a + 2*b + c + 2) >> 2, where a is the source byte and b, c the bytes one
// and two rows (srcpitch bytes each) below it. Bytes are widened to 16-bit
// words for the arithmetic and packed back with unsigned saturation.
// The function is naked: arguments are read from the stack at esp+20 and up
// after the four register pushes.
// The loop handles w >> 3 groups of 8 bytes and decrements the count before
// testing it, so w is presumably expected to be a non-zero multiple of 8
// (TODO confirm against the caller, which is not visible here).
// mask0, mask1 and mask2 are referenced only by the commented-out original
// implementation kept at the end of the asm block.
// No emms is executed in this function.
00645 static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00646 {
00647         static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64;
00648         static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64;
00649         static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64;
00650         static const __int64 _x0002000200020002 = 0x0002000200020002;
00651 
00652         __asm {
00653                 push    ebp
00654                 push    edi
00655                 push    esi
00656                 push    ebx
00657 
00658                 mov             edi, [esp+20] // dst
00659                 mov             esi, [esp+24] // src
00660                 sub             edi, esi // edi = dst - src, so [edi+esi] addresses the destination
00661                 mov             ebp, [esp+28] // w
00662                 mov             edx, [esp+32] // srcpitch
00663 
00664                 shr             ebp, 3 // ebp = number of 8-byte groups
00665 
00666                 movq    mm6, _x0002000200020002 // rounding term, 2 per word
00667                 pxor    mm7, mm7 // zero, used to widen bytes to words
00668 
00669 xloop:
00670                 movq            mm0, [esi] // 8 bytes of the current row
00671                 movq            mm3, mm0
00672                 punpcklbw       mm0, mm7
00673                 punpckhbw       mm3, mm7
00674 
00675                 movq            mm1, [esi+edx] // 8 bytes one row below
00676                 movq            mm4, mm1
00677                 punpcklbw       mm1, mm7
00678                 punpckhbw       mm4, mm7
00679 
00680                 movq            mm2, [esi+edx*2] // 8 bytes two rows below
00681                 movq            mm5, mm2
00682                 punpcklbw       mm2, mm7
00683                 punpckhbw       mm5, mm7
00684 
00685                 psllw           mm1, 1 // middle row counts twice
00686                 paddw           mm1, mm0
00687                 paddw           mm1, mm2
00688                 paddw           mm1, mm6
00689                 psrlw           mm1, 2 // low four results: (a + 2*b + c + 2) >> 2
00690 
00691                 psllw           mm4, 1
00692                 paddw           mm4, mm3
00693                 paddw           mm4, mm5
00694                 paddw           mm4, mm6
00695                 psrlw           mm4, 2 // high four results
00696 
00697                 add                     esi, 8
00698                 packuswb        mm1, mm4
00699                 movq            [edi+esi-8], mm1 // -8 compensates for esi having advanced already
00700 
00701                 dec             ebp
00702                 jne             xloop
00703 
00704                 // sadly the original code makes a lot of visible banding artifacts on yuv
00705                 // (it seems those shiftings without rounding introduce too much error)
00706 /*
00707                 mov             edi,[esp+20]
00708                 mov             esi,[esp+24]
00709                 sub             edi,esi
00710                 mov             ebp,[esp+28]
00711                 mov             edx,[esp+32]
00712 
00713                 movq    mm5,mask0
00714                 movq    mm6,mask1
00715                 movq    mm7,mask2
00716                 shr             ebp,1
00717                 jz              oddpart
00718 
00719 xloop:
00720                 movq    mm2,[esi]
00721                 movq    mm0,mm5
00722 
00723                 movq    mm1,[esi+edx]
00724                 pand    mm0,mm2
00725 
00726                 psrlq   mm1,1
00727                 movq    mm2,[esi+edx*2]
00728 
00729                 psrlq   mm2,2
00730                 pand    mm1,mm6
00731 
00732                 psrlq   mm0,2
00733                 pand    mm2,mm7
00734 
00735                 paddb   mm0,mm1
00736                 add             esi,8
00737 
00738                 paddb   mm0,mm2
00739                 dec             ebp
00740 
00741                 movq    [edi+esi-8],mm0
00742                 jne             xloop
00743 
00744 oddpart:
00745                 test    byte ptr [esp+28],1
00746                 jz              nooddpart
00747 
00748                 mov             ecx,[esi]
00749                 mov             eax,0fcfcfcfch
00750                 mov             ebx,[esi+edx]
00751                 and             eax,ecx
00752                 shr             ebx,1
00753                 mov             ecx,[esi+edx*2]
00754                 shr             ecx,2
00755                 and             ebx,07f7f7f7fh
00756                 shr             eax,2
00757                 and             ecx,03f3f3f3fh
00758                 add             eax,ebx
00759                 add             eax,ecx
00760                 mov             [edi+esi],eax
00761 
00762 nooddpart:
00763 */
00764                 pop             ebx
00765                 pop             esi
00766                 pop             edi
00767                 pop             ebp
00768                 ret
00769         };
00770 }
00771 
00772 __declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
00773 
00774 static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00775 {
00776         __asm
00777         {
00778                 mov edx, srcpitch
00779                 mov esi, src
00780                 mov edi, dst
00781                 sub edi, esi
00782                 mov ecx, w
00783                 mov ebx, ecx
00784                 shr ecx, 4
00785                 and ebx, 15
00786 
00787                 movdqa xmm7, [const_1_16_bytes] 
00788 
00789 asm_blend_row_SSE2_loop:
00790                 movdqa xmm0, [esi]
00791                 movdqa xmm1, [esi+edx]
00792                 movdqa xmm2, [esi+edx*2]
00793                 pavgb xmm0, xmm1
00794                 pavgb xmm2, xmm1
00795                 psubusb xmm0, xmm7
00796                 pavgb xmm0, xmm2
00797                 movdqa [esi+edi], xmm0
00798                 add esi, 16
00799                 dec     ecx
00800                 jnz asm_blend_row_SSE2_loop
00801 
00802                 test ebx,15
00803                 jz asm_blend_row_SSE2_end
00804 
00805                 mov ecx, ebx
00806                 xor ax, ax
00807                 xor bx, bx
00808                 xor dx, dx
00809 asm_blend_row_SSE2_loop2:
00810                 mov al, [esi]
00811                 mov bl, [esi+edx]
00812                 mov dl, [esi+edx*2]
00813                 add ax, bx
00814                 inc ax
00815                 shr ax, 1
00816                 add dx, bx
00817                 inc dx
00818                 shr dx, 1
00819                 add ax, dx
00820                 shr ax, 1
00821                 mov [esi+edi], al
00822                 inc esi
00823                 dec     ecx
00824                 jnz asm_blend_row_SSE2_loop2
00825 
00826 asm_blend_row_SSE2_end:
00827         }
00828 }
00829 
00830 static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
00831 {
00832         __asm
00833         {
00834                 mov edx, srcpitch
00835                 mov esi, src
00836                 mov edi, dst
00837                 sub edi, esi
00838                 mov ecx, w
00839                 mov ebx, ecx
00840                 shr ecx, 4
00841                 and ebx, 15
00842 
00843                 movdqa xmm7, [const_1_16_bytes] 
00844 
00845 asm_blend_row_clipped_SSE2_loop:
00846                 movdqa xmm0, [esi]
00847                 movdqa xmm1, [esi+edx]
00848                 pavgb xmm0, xmm1
00849                 movdqa [esi+edi], xmm0
00850                 add esi, 16
00851                 dec     ecx
00852                 jnz asm_blend_row_clipped_SSE2_loop
00853 
00854                 test ebx,15
00855                 jz asm_blend_row_clipped_SSE2_end
00856 
00857                 mov ecx, ebx
00858                 xor ax, ax
00859                 xor bx, bx
00860 asm_blend_row_clipped_SSE2_loop2:
00861                 mov al, [esi]
00862                 mov bl, [esi+edx]
00863                 add ax, bx
00864                 inc ax
00865                 shr ax, 1
00866                 mov [esi+edi], al
00867                 inc esi
00868                 dec     ecx
00869                 jnz asm_blend_row_clipped_SSE2_loop2
00870 
00871 asm_blend_row_clipped_SSE2_end:
00872         }
00873 }
00874 
00875 void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)
00876 {
00877         void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
00878         void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
00879 
00880         if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf))
00881         {
00882                 asm_blend_row_clipped = asm_blend_row_clipped_SSE2;
00883                 asm_blend_row = asm_blend_row_SSE2;
00884         }
00885         else if(g_cpuid.m_flags & CCpuID::mmx)
00886         {
00887                 asm_blend_row_clipped = asm_blend_row_clipped_MMX;
00888                 asm_blend_row = asm_blend_row_MMX;
00889         }
00890         else
00891         {
00892                 asm_blend_row_clipped = asm_blend_row_clipped_c;
00893                 asm_blend_row = asm_blend_row_c;
00894         }
00895 
00896         if(!asm_blend_row_clipped)
00897                 return;
00898 
00899         asm_blend_row_clipped(dst, src, rowbytes, srcpitch);
00900 
00901         if((h -= 2) > 0) do
00902         {
00903                 dst += dstpitch;
00904                 asm_blend_row(dst, src, rowbytes, srcpitch);
00905         src += srcpitch;
00906         }
00907         while(--h);
00908 
00909         asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch);
00910 
00911         if(g_cpuid.m_flags & CCpuID::mmx)
00912                 __asm emms
00913 }
00914 
00915 void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
00916 {
00917         if(h <= 1) return;
00918 
00919         BYTE* s = dst;
00920         BYTE* d = dst + (h-2)*pitch;
00921 
00922         for(; s < d; s += pitch*2)
00923         {
00924                 BYTE* tmp = s;
00925 
00926                 if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))
00927                 {
00928                         __asm
00929                         {
00930                                 mov             esi, tmp
00931                                 mov             ebx, pitch
00932 
00933                                 mov             ecx, ebx
00934                                 shr             ecx, 4
00935 
00936 AvgLines8_sse2_loop:
00937                                 movdqa  xmm0, [esi]
00938                                 pavgb   xmm0, [esi+ebx*2]
00939                                 movdqa  [esi+ebx], xmm0
00940                                 add             esi, 16
00941 
00942                                 dec             ecx
00943                                 jnz             AvgLines8_sse2_loop
00944 
00945                                 mov             tmp, esi
00946                         }
00947 
00948                         for(int i = pitch&7; i--; tmp++)
00949                         {
00950                                 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
00951                         }
00952                 }
00953                 else if(g_cpuid.m_flags & CCpuID::mmx)
00954                 {
00955                         __asm
00956                         {
00957                                 mov             esi, tmp
00958                                 mov             ebx, pitch
00959 
00960                                 mov             ecx, ebx
00961                                 shr             ecx, 3
00962 
00963                                 pxor    mm7, mm7
00964 AvgLines8_mmx_loop:
00965                                 movq    mm0, [esi]
00966                                 movq    mm1, mm0
00967 
00968                                 punpcklbw       mm0, mm7
00969                                 punpckhbw       mm1, mm7
00970 
00971                                 movq    mm2, [esi+ebx*2]
00972                                 movq    mm3, mm2
00973 
00974                                 punpcklbw       mm2, mm7
00975                                 punpckhbw       mm3, mm7
00976 
00977                                 paddw   mm0, mm2
00978                                 psrlw   mm0, 1
00979 
00980                                 paddw   mm1, mm3
00981                                 psrlw   mm1, 1
00982 
00983                                 packuswb        mm0, mm1
00984 
00985                                 movq    [esi+ebx], mm0
00986 
00987                                 lea             esi, [esi+8]
00988 
00989                                 dec             ecx
00990                                 jnz             AvgLines8_mmx_loop
00991 
00992                                 mov             tmp, esi
00993                         }
00994 
00995                         for(int i = pitch&7; i--; tmp++)
00996                         {
00997                                 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
00998                         }
00999                 }
01000                 else
01001                 {
01002                         for(int i = pitch; i--; tmp++)
01003                         {
01004                                 tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
01005                         }
01006                 }
01007         }
01008 
01009         if(!(h&1) && h >= 2)
01010         {
01011                 dst += (h-2)*pitch;
01012                 memcpy_accel(dst + pitch, dst, pitch);
01013         }
01014 
01015         __asm emms;
01016 }
01017 
01018 void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)
01019 {
01020         if(h <= 1) return;
01021 
01022         unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00;
01023         unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
01024         unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
01025 
01026         BYTE* s = dst;
01027         BYTE* d = dst + (h-2)*pitch;
01028 
01029         for(; s < d; s += pitch*2)
01030         {
01031                 BYTE* tmp = s;
01032 
01033                 __asm
01034                 {
01035                         mov             esi, tmp
01036                         mov             ebx, pitch
01037 
01038                         mov             ecx, ebx
01039                         shr             ecx, 3
01040 
01041                         movq    mm6, __0x03e003e003e003e0
01042                         movq    mm7, __0x001f001f001f001f
01043 
01044 AvgLines555_loop:
01045                         movq    mm0, [esi]
01046                         movq    mm1, mm0
01047                         movq    mm2, mm0
01048 
01049                         psrlw   mm0, 10                         // red1 bits: mm0 = 001f001f001f001f
01050                         pand    mm1, mm6                        // green1 bits: mm1 = 03e003e003e003e0
01051                         pand    mm2, mm7                        // blue1 bits: mm2 = 001f001f001f001f
01052 
01053                         movq    mm3, [esi+ebx*2]
01054                         movq    mm4, mm3
01055                         movq    mm5, mm3
01056 
01057                         psrlw   mm3, 10                         // red2 bits: mm3 = 001f001f001f001f
01058                         pand    mm4, mm6                        // green2 bits: mm4 = 03e003e003e003e0
01059                         pand    mm5, mm7                        // blue2 bits: mm5 = 001f001f001f001f
01060 
01061                         paddw   mm0, mm3
01062                         psrlw   mm0, 1                          // (red1+red2)/2
01063                         psllw   mm0, 10                         // red bits at 7c007c007c007c00
01064 
01065                         paddw   mm1, mm4
01066                         psrlw   mm1, 1                          // (green1+green2)/2
01067                         pand    mm1, mm6                        // green bits at 03e003e003e003e0
01068 
01069                         paddw   mm2, mm5
01070                         psrlw   mm2, 1                          // (blue1+blue2)/2
01071                                                                                 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)
01072 
01073                         por             mm0, mm1
01074                         por             mm0, mm2
01075 
01076                         movq    [esi+ebx], mm0
01077 
01078                         lea             esi, [esi+8]
01079 
01080                         dec             ecx
01081                         jnz             AvgLines555_loop
01082 
01083                         mov             tmp, esi
01084                 }
01085 
01086                 for(int i = (pitch&7)>>1; i--; tmp++)
01087                 {
01088                         tmp[pitch] = 
01089                                 ((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)|
01090                                 ((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)|
01091                                 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
01092                 }
01093         }
01094 
01095         if(!(h&1) && h >= 2)
01096         {
01097                 dst += (h-2)*pitch;
01098                 memcpy_accel(dst + pitch, dst, pitch);
01099         }
01100 
01101         __asm emms;
01102 }
01103 
01104 void AvgLines565(BYTE* dst, DWORD h, DWORD pitch)
01105 {
01106         if(h <= 1) return;
01107 
01108         unsigned __int64 __0xf800f800f800f800 = 0xf800f800f800f800;
01109         unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0;
01110         unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
01111 
01112         BYTE* s = dst;
01113         BYTE* d = dst + (h-2)*pitch;
01114 
01115         for(; s < d; s += pitch*2)
01116         {
01117                 WORD* tmp = (WORD*)s;
01118 
01119                 __asm
01120                 {
01121                         mov             esi, tmp
01122                         mov             ebx, pitch
01123 
01124                         mov             ecx, ebx
01125                         shr             ecx, 3
01126 
01127                         movq    mm6, __0x07e007e007e007e0
01128                         movq    mm7, __0x001f001f001f001f
01129 
01130 AvgLines565_loop:
01131                         movq    mm0, [esi]
01132                         movq    mm1, mm0
01133                         movq    mm2, mm0
01134 
01135                         psrlw   mm0, 11                         // red1 bits: mm0 = 001f001f001f001f
01136                         pand    mm1, mm6                        // green1 bits: mm1 = 07e007e007e007e0
01137                         pand    mm2, mm7                        // blue1 bits: mm2 = 001f001f001f001f
01138 
01139                         movq    mm3, [esi+ebx*2]
01140                         movq    mm4, mm3
01141                         movq    mm5, mm3
01142 
01143                         psrlw   mm3, 11                         // red2 bits: mm3 = 001f001f001f001f
01144                         pand    mm4, mm6                        // green2 bits: mm4 = 07e007e007e007e0
01145                         pand    mm5, mm7                        // blue2 bits: mm5 = 001f001f001f001f
01146 
01147                         paddw   mm0, mm3
01148                         psrlw   mm0, 1                          // (red1+red2)/2
01149                         psllw   mm0, 11                         // red bits at f800f800f800f800
01150 
01151                         paddw   mm1, mm4
01152                         psrlw   mm1, 1                          // (green1+green2)/2
01153                         pand    mm1, mm6                        // green bits at 03e003e003e003e0
01154 
01155                         paddw   mm2, mm5
01156                         psrlw   mm2, 1                          // (blue1+blue2)/2
01157                                                                                 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)
01158 
01159                         por             mm0, mm1
01160                         por             mm0, mm2
01161 
01162                         movq    [esi+ebx], mm0
01163 
01164                         lea             esi, [esi+8]
01165 
01166                         dec             ecx
01167                         jnz             AvgLines565_loop
01168 
01169                         mov             tmp, esi
01170                 }
01171 
01172                 for(int i = (pitch&7)>>1; i--; tmp++)
01173                 {
01174                         tmp[pitch] = 
01175                                 ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)|
01176                                 ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)|
01177                                 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
01178                 }
01179         }
01180 
01181         if(!(h&1) && h >= 2)
01182         {
01183                 dst += (h-2)*pitch;
01184                 memcpy_accel(dst + pitch, dst, pitch);
01185         }
01186 
01187         __asm emms;
01188 }
01189 
01190 extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
01191 extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
01192 
01193 bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch)
01194 {
01195         void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL;
01196 
01197         if(g_cpuid.m_flags & CCpuID::mmx)
01198         {
01199                 YUY2toRGB = 
01200                         dbpp == 32 ? mmx_YUY2toRGB32 :
01201                         dbpp == 24 ? mmx_YUY2toRGB24 :
01202                         // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO
01203                         NULL;
01204         }
01205         else
01206         {
01207                 // TODO
01208         }
01209 
01210         if(!YUY2toRGB) return(false);
01211 
01212         YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false);
01213 
01214         return(true);
01215 }

Generated on Tue Dec 13 14:47:07 2005 for guliverkli by  doxygen 1.4.5