idctfpu.cpp

00001 #include "stdafx.h"
00002 
00003 /* idct.c, inverse fast discrete cosine transform                           */
00004 
00005 
00006 /*************************************************************/
00007 /* inverse two dimensional DCT, Chen-Wang algorithm          */
00008 /* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)                */
00009 /*                                                           */
00010 /* floating point conversion by Miha Peternel                */
00011 /* x87 hand-optimized assembly by Miha Peternel              */
00012 /*                                    27.11. - 11.12.2000    */
00013 /*                                                           */
00014 /* You are free to use this code in your project if:         */
00015 /* - no changes are made to this message                     */
00016 /* - any changes to this code are publicly available         */
00017 /* - your project documentation contains the following text: */
00018 /*   "This software contains fast high-quality IDCT decoder  */
00019 /*    by Miha Peternel."                                     */
00020 /*                                                           */
00021 /*************************************************************/
00022 
00023 
00025 //
00026 // TODO:
00027 // - loops can be easily vectorized for SIMD
00028 //
00030 
00031 #include <math.h>
// File-scope constants used by Initialize_FPU_IDCT() and FPU_IDCT().
// Apart from PI, RC and D8, every value below is assigned at run time by
// Initialize_FPU_IDCT(); FPU_IDCT() reads them as 8-byte (qword) operands.
00032 #  define PI 3.1415926535897932384626433832795
00033 
00034 #define FLOAT double // storage type of every constant declared below
00035 
00036 const static double RC = 1.0*1024*1024*1024*1024*256*16 + 1024; // magic + clip center; evaluates to 2^52 + 1024, not referenced in the visible source
00037 
00038 static FLOAT W1; // sqrt(2)*cos(1*pi/16)
00039 static FLOAT W2; // sqrt(2)*cos(2*pi/16)
00040 static FLOAT W5; // sqrt(2)*cos(5*pi/16)
00041 
00042 static FLOAT W1_8; // W1/8
00043 static FLOAT W2_8; // W2/8
00044 static FLOAT W5_8; // W5/8
00045 
00046 static FLOAT W7; // sqrt(2)*cos(7*pi/16)
00047 static FLOAT W1mW7; // W1-W7
00048 static FLOAT W1pW7; // W1+W7
00049 
00050 static FLOAT W3; // sqrt(2)*cos(3*pi/16)
00051 static FLOAT W3mW5; // W3-W5
00052 static FLOAT W3pW5; // W3+W5
00053 
00054 static FLOAT W6; // sqrt(2)*cos(6*pi/16)
00055 static FLOAT W2mW6; // W2-W6
00056 static FLOAT W2pW6; // W2+W6
00057 
00058 static FLOAT S2; // 1/sqrt(2), assigned as sqrt(0.5)
00059 static FLOAT D8 = 1.0/8; // 1/8; the only constant here with a static initializer
00060 
00061 static FLOAT W7_8; // W7/8
00062 static FLOAT W1mW7_8; // (W1-W7)/8
00063 static FLOAT W1pW7_8; // (W1+W7)/8
00064 
00065 static FLOAT W3_8; // W3/8
00066 static FLOAT W3mW5_8; // (W3-W5)/8
00067 static FLOAT W3pW5_8; // (W3+W5)/8
00068 
00069 static FLOAT W6_8; // W6/8
00070 static FLOAT W2mW6_8; // (W2-W6)/8
00071 static FLOAT W2pW6_8; // (W2+W6)/8
00072 
00073 /* global declarations */
00074 
00075 /* private data */
00076 static short iclip[1024+1024]; /* clipping table: 2048 entries, filled by Initialize_FPU_IDCT() */
00077 static short *iclp; // set by Initialize_FPU_IDCT() to iclip+1024, so indices -1024..1023 are valid
00078 
00079 void Initialize_FPU_IDCT()
00080 {
00081   int slot; // position inside iclip[], 0..2047
00082   // Base constants: S2 = 1/sqrt(2), Wk = sqrt(2)*cos(k*pi/16).
00083   S2 = sqrt(0.5);
00084   W1 = sqrt(2.0)*cos(PI*(1.0/16));
00085   W2 = sqrt(2.0)*cos(PI*(2.0/16));
00086   W3 = sqrt(2.0)*cos(PI*(3.0/16));
00087   W5 = sqrt(2.0)*cos(PI*(5.0/16));
00088   W6 = sqrt(2.0)*cos(PI*(6.0/16));
00089   W7 = sqrt(2.0)*cos(PI*(7.0/16));
00090   // Pairwise differences and sums of the base constants.
00091   W1mW7 = W1-W7;  W3mW5 = W3-W5;  W2mW6 = W2-W6;
00092   W1pW7 = W1+W7;  W3pW5 = W3+W5;  W2pW6 = W2+W6;
00093   // Every "_8" twin holds the matching value divided by 8.
00094   W1_8 = W1/8;  W2_8 = W2/8;  W3_8 = W3/8;
00095   W5_8 = W5/8;  W6_8 = W6/8;  W7_8 = W7/8;
00096   W1mW7_8 = W1mW7/8;  W3mW5_8 = W3mW5/8;  W2mW6_8 = W2mW6/8;
00097   W1pW7_8 = W1pW7/8;  W3pW5_8 = W3pW5/8;  W2pW6_8 = W2pW6/8;
00098   // Clip table: the entry for value (slot-1024) is that value limited
00099   // to [-256, 255]; iclp points at the entry for value 0.
00100   iclp = iclip+1024;
00101   for (slot = 0; slot < 1024+1024; ++slot)
00102   {
00103     int value = slot-1024;
00104     if (value < -256)     value = -256;
00105     else if (value > 255) value = 255;
00106     iclip[slot] = (short)value;
00107   }
00108 }
00109 
/*
 * FPU_IDCT - in-place 8x8 inverse DCT of `block`, written as x87 inline asm.
 *
 * block: 64 coefficients stored as 8 rows of 8 shorts; the results are
 *        written back into the same array.
 *
 * Flow, as visible below:
 *  1. C fast path: the block is scanned as 32 ints.  If every int is zero
 *     (the last one may also be 0x10000), the last int is cleared and the
 *     function returns without running the transform.
 *  2. Row pass: each row of shorts becomes a row of doubles in an 8x8
 *     scratch matrix on the stack, using the unscaled constants
 *     (W7, W1mW7, W1pW7, W3, W3mW5, W3pW5, W6, W2mW6, W2pW6, S2).
 *  3. Column pass: each scratch column is processed with the 1/8-scaled
 *     constants (W7_8 ... W2pW6_8, D8) plus S2, converted to 32-bit
 *     integers with fistp, mapped through the clip table and stored back
 *     into `block` as shorts.
 *
 * The constants and the clip table are assigned in Initialize_FPU_IDCT(),
 * so that function has to run before the first call.
 *
 * NOTE(review): the (int *) view of a short array assumes the block is
 *   suitably aligned for int access and sidesteps strict aliasing - confirm
 *   this is acceptable for the compiler settings in use.
 * NOTE(review): clip-table lookups are not range checked; an integer result
 *   outside -1024..1023 would index outside iclip[2048] - confirm the
 *   input range guarantees this cannot happen.
 * NOTE(review): the tmp*, int* and SIZE macros are not #undef'd anywhere in
 *   the visible source, so they stay defined for any code that follows.
 * NOTE(review): ebx, esi and edi are overwritten inside the __asm block;
 *   presumably the compiler preserves them around it - confirm for the
 *   toolchain in use.
 */
00110 void FPU_IDCT(short *block)
00111 {
00112         int *b = (int *) block; // b[k] overlays block[2k] and block[2k+1]
  // Fast path.  On x86 (little-endian) b[31]==0x10000 means block[62]==0 and
  // block[63]==1.  NOTE(review): presumably a lone 1 in the last coefficient
  // comes from MPEG-2 mismatch control - confirm against the caller.
00113   if( b[0]==0 && (b[31]==0x10000 || b[31]==0) )
00114         {
          // Any non-zero coefficient in b[1]..b[30] forces the full transform.
00115           if( b[ 1]|b[ 2]|b[ 3]|b[ 4]|b[ 5] )
00116                   goto normal;
00117           if( b[ 6]|b[ 7]|b[ 8]|b[ 9]|b[10] )
00118                   goto normal;
00119           if( b[11]|b[12]|b[13]|b[14]|b[15] )
00120                   goto normal;
00121           if( b[16]|b[17]|b[18]|b[19]|b[20] )
00122                   goto normal;
00123           if( b[21]|b[22]|b[23]|b[24]|b[25] )
00124                   goto normal;
00125           if( b[26]|b[27]|b[28]|b[29]|b[30] )
00126                   goto normal;
00127                 b[31]=0; // leave an all-zero block behind
00129                 return;
00130         }
00131 normal:
00132 
// Stack layout relative to ebx (set up at the top of the __asm block):
//   [ebx .. ebx+8*8*8)   8x8 matrix of doubles written by the row pass
//   tmp1..tmp3           three 8-byte spill slots just below ebx
//   int0..int7           eight 4-byte slots below those, targets of fistp
00133 #define tmp  ebx
00134 #define tmp1 ebx-1*8
00135 #define tmp2 ebx-2*8
00136 #define tmp3 ebx-3*8
00137 #define int0 ebx-3*8-1*4
00138 #define int1 ebx-3*8-2*4
00139 #define int2 ebx-3*8-3*4
00140 #define int3 ebx-3*8-4*4
00141 #define int4 ebx-3*8-5*4
00142 #define int5 ebx-3*8-6*4
00143 #define int6 ebx-3*8-7*4
00144 #define int7 ebx-3*8-8*4
00145 #define SIZE 8*8*8+3*8+8*4+16 // locals + 16-byte alignment area
00146         __asm
00147         {
// Reserve SIZE bytes of stack and point ebx at the matrix base, rounded
// down to a multiple of 16.
00148           lea ebx,[esp-8*8*8]
00149                 sub esp,SIZE
00150                 and ebx,-16 // force 16-byte alignment of locals
00151 
00152 // rows
// esi walks the input rows (8 shorts each), edi walks the scratch rows
// (8 doubles each), ecx counts the 8 rows.
00153                 mov esi,[block]
00154                 lea edi,[tmp]
00155                 mov ecx,8
00156 
00157                 align 16
00158 Lrows:
// OR together coefficients 1..7 of this row; a non-zero result selects the
// full transform at L1.
00159     movsx eax,word ptr [esi+2]
00160                 or    eax,         [esi+4]
00161                 or    eax,         [esi+8]
00162                 or    eax,         [esi+12]
00163                 jnz L1
00164 
// Only coefficient 0 can be non-zero: copy it, converted to double, into
// all eight outputs of the scratch row.
00165                 fild word ptr [esi+0*2]
00166                 fst  qword ptr [edi+7*8]
00167                 fst  qword ptr [edi+6*8]
00168                 fst  qword ptr [edi+5*8]
00169                 fst  qword ptr [edi+4*8]
00170                 fst  qword ptr [edi+3*8]
00171                 fst  qword ptr [edi+2*8]
00172                 fst  qword ptr [edi+1*8]
00173                 fstp qword ptr [edi+0*8]
00174                 jmp L2
00175 
00176                 align 16
00177         L1:
00178 
// Full row transform.  Coefficients 7, 1, 3 and 5 are loaded first and
// combined with W7, W1mW7, W1pW7, W3, W3mW5 and W3pW5; intermediate values
// are spilled to tmp1..tmp3.
00179                 fild word ptr [esi+7*2]
00180                 fld st(0)
00181                 fild word ptr [esi+1*2]
00182                 fadd st(1),st(0)
00183                 fld qword ptr [W7]
00184                 fxch st(1)
00185                 fmul qword ptr [W1mW7]
00186                 fxch st(1)
00187                 fmulp st(2),st(0)
00188                 fadd st(0),st(1)
00189                 fstp qword ptr [tmp1]
00190                 fild word ptr [esi+3*2]
00191                 fld st(0)
00192                 fxch st(3)
00193                 fmul qword ptr [W1pW7]
00194                 fild word ptr [esi+5*2]
00195                 fadd st(4),st(0)
00196                 fmul qword ptr [W3mW5]
00197                 fxch st(1)
00198                 fsubp st(3),st(0)//fsubrp
00199                 fld qword ptr [W3]
00200                 fmulp st(4),st(0)
00201                 fsubr st(0),st(3)
00202                 fstp qword ptr [tmp2]
00203                 fmul qword ptr [W3pW5]
00204                 fsubp st(2),st(0)//fsubrp
00205                 fxch st(1)
00206                 fstp qword ptr [tmp3]
// Coefficients 0, 4, 2 and 6 follow, combined with W2mW6, W6 and W2pW6.
00207                 fild word ptr [esi+0*2]
00208                 fild word ptr [esi+4*2]
00209                 fild word ptr [esi+2*2]
00210                 fld st(0)
00211                 fmul qword ptr [W2mW6]
00212                 fld st(3)
00213                 fild word ptr [esi+6*2]
00214                 fxch st(5)
00215                 fsub st(0),st(4)
00216                 fxch st(3)
00217                 fadd st(0),st(5)
00218                 fxch st(1)
00219                 faddp st(4),st(0)
00220                 fld qword ptr [W6]
00221                 fmulp st(1),st(0)
00222                 fxch st(4)
00223                 fmul qword ptr [W2pW6]
// The spilled values are reloaded and merged with the terms above; tmp1..tmp3
// are reused for new intermediates along the way.
00224                 fld qword ptr [tmp1]
00225                 fsub qword ptr [tmp2]
00226                 fld st(5)
00227                 fxch st(3)
00228                 faddp st(6),st(0)
00229                 fld qword ptr [tmp1]
00230                 fxch st(1)
00231                 fstp qword ptr [tmp1]
00232                 fld st(6)
00233                 fadd qword ptr [tmp3]
00234                 fxch st(1)
00235                 fadd qword ptr [tmp2]
00236                 fxch st(7)
00237                 fsub qword ptr [tmp3]
00238                 fxch st(1)
00239                 fstp qword ptr [tmp2]
00240                 fld st(4)
00241                 fxch st(3)
00242                 fsubrp st(2),st(0)//fsubp
00243                 fxch st(4)
00244                 fsub st(0),st(5)
00245                 fxch st(2)
00246                 faddp st(5),st(0)
00247                 fld st(2)
00248                 fsub st(0),st(1)
00249                 fxch st(5)
00250                 fstp qword ptr [tmp3]
00251                 fld qword ptr [tmp1]
00252                 fld qword ptr [S2]
00253                 fxch st(4)
00254                 faddp st(2),st(0)
00255                 fld st(3)
00256                 fxch st(1)
00257                 fadd st(0),st(5)
00258                 fmulp st(1),st(0)
00259 
// Final stage of the row: the eight results are stored as doubles at
// [edi+k*8], in the order k = 0, 7, 3, 1, 4, 6, 5, 2.
00260                 fld qword ptr [tmp3]
00261                 fadd st(0),st(7)
00262                 fxch st(5)
00263                 fsubr qword ptr [tmp1]
00264                 fxch st(5)
00265                 fstp qword ptr [edi+0*8]
00266                 fxch st(6)
00267                 fsubr qword ptr [tmp3]
00268                 fld st(2)
00269                 fxch st(1)
00270                 fstp qword ptr [edi+7*8]
00271                 fadd qword ptr [tmp2]
00272                 fxch st(3)
00273                 fmulp st(4),st(0)
00274                 fxch st(2)
00275                 fstp qword ptr [edi+3*8]
00276                 fld st(1)
00277                 fadd st(0),st(5)
00278                 fxch st(1)
00279                 fsub qword ptr [tmp2]
00280                 fxch st(2)
00281                 fsubrp st(5),st(0)//fsubp
00282                 fstp qword ptr [edi+1*8]
00283                 fld st(2)
00284                 fxch st(1)
00285                 fstp qword ptr [edi+4*8]
00286                 fxch st(2)
00287                 fsub st(0),st(1)
00288                 fxch st(2)
00289                 faddp st(1),st(0)
00290                 fxch st(2)
00291                 fstp qword ptr [edi+6*8]
00292                 fstp qword ptr [edi+5*8]
00293                 fstp qword ptr [edi+2*8]
00294         L2:
// Advance to the next input row (8 shorts) and scratch row (8 doubles).
00295           add esi,8*2
00296                 add edi,8*8
00297                 dec ecx
00298                 jnz Lrows
00299 
00300 // columns
// esi walks the scratch columns, edi walks the output columns in `block`,
// edx points at the middle of the clip table (byte offset 1024*2 into iclip,
// i.e. element 1024), ecx counts the 8 columns.
00301     lea esi,[tmp]
00302                 mov edi,[block]
00303                 lea edx,[iclip+1024*2]
00304                 mov ecx,8
00305 
00306     align 16
00307 Lcols:
// Column element k lives at [esi+k*8*8].  Elements 7, 1, 3 and 5 are loaded
// first and combined with W7_8, W1mW7_8, W1pW7_8, W3_8, W3mW5_8 and W3pW5_8;
// intermediates are spilled to tmp2 and tmp3.
00308                 fld qword ptr [esi+7*8*8]
00309                 fld st(0)
00310                 fld qword ptr [esi+1*8*8]
00311                 fadd st(1),st(0)
00312                 fld qword ptr [W7_8]
00313                 fxch st(1)
00314                 fmul qword ptr [W1mW7_8]
00315                 fxch st(1)
00316                 fmulp st(2),st(0)
00317                 fadd st(0),st(1)
00318                 fstp qword ptr [tmp2]
00319                 fld qword ptr [esi+3*8*8]
00320                 fld st(0)
00321                 fxch st(3)
00322                 fmul qword ptr [W1pW7_8]
00323                 fld qword ptr [esi+5*8*8]
00324                 fadd st(4),st(0)
00325                 fmul qword ptr [W3mW5_8]
00326                 fxch st(1)
00327                 fsubp st(3),st(0)//fsubrp
00328                 fld qword ptr [W3_8]
00329                 fmulp st(4),st(0)
00330                 fsubr st(0),st(3)
00331                 fstp qword ptr [tmp3]
// Elements 0 and 4 are each multiplied by D8 (1/8); elements 6 and 2 are
// combined with W6_8, W2pW6_8 and W2mW6_8.
00332                 fld qword ptr [D8]
00333                 fld qword ptr [esi+0*8*8]
00334                 fmul st(0),st(1)
00335                 fxch st(2)
00336                 fmul qword ptr [W3pW5_8]
00337                 fld qword ptr [esi+4*8*8]
00338                 fmulp st(2),st(0)
00339                 fld qword ptr [esi+6*8*8]
00340                 fld st(3)
00341                 fxch st(6)
00342                 fsubrp st(2),st(0)//fsubp
00343                 fld qword ptr [esi+2*8*8]
00344                 fld st(0)
00345                 fxch st(5)
00346                 fsub st(0),st(4)
00347                 fxch st(7)
00348                 faddp st(4),st(0)
00349                 fxch st(4)
00350                 fadd st(0),st(1)
00351                 fld qword ptr [W6_8]
00352                 fxch st(2)
00353                 fmul qword ptr [W2pW6_8]
00354                 fxch st(2)
00355                 fmulp st(1),st(0)
00356                 fxch st(4)
00357                 fmul qword ptr [W2mW6_8]
// The spilled values are reloaded and merged with the terms above; tmp1..tmp3
// are reused for new intermediates along the way.
00358                 fld qword ptr [tmp2]
00359                 fsub qword ptr [tmp3]
00360                 fxch st(2)
00361                 fsubr st(0),st(5)
00362                 fxch st(1)
00363                 faddp st(5),st(0)
00364                 fld qword ptr [tmp2]
00365                 fxch st(2)
00366                 fstp qword ptr [tmp2]
00367                 fld st(5)
00368                 fxch st(2)
00369                 fadd qword ptr [tmp3]
00370                 fxch st(6)
00371                 fsub st(0),st(3)
00372                 fxch st(2)
00373                 faddp st(3),st(0)
00374                 fld st(3)
00375                 fsub st(0),st(5)
00376                 fxch st(3)
00377                 fstp qword ptr [tmp3]
00378                 fxch st(3)
00379                 faddp st(4),st(0)
00380                 fld st(5)
00381                 fld qword ptr [tmp2]
00382                 fxch st(7)
00383                 fsub st(0),st(4)
00384                 fxch st(7)
00385                 fadd st(0),st(2)
00386                 fxch st(1)
00387                 faddp st(4),st(0)
00388                 fld qword ptr [S2]
00389                 fmul st(1),st(0)
00390                 fxch st(1)
00391                 fstp qword ptr [tmp1]
00392                 fld st(4)
00393                 fadd st(0),st(6)
00394                 fxch st(2)
00395                 fsubr qword ptr [tmp2]
00396                 fxch st(5)
00397                 fsubrp st(6),st(0)//fsubp
00398                 fxch st(1)
// Output stage.  Each result is converted to a 32-bit integer in its intN
// slot by fistp (which rounds per the current x87 rounding mode), used as a
// signed index into the clip table at edx, and the 16-bit table entry is
// stored to output row N of this column at [edi+N*8*2].  Rows are written
// in the order N = 0, 7, 3, 4, 1, 6, 2, 5, interleaved with the remaining
// arithmetic.
00399                 fistp dword ptr [int0]
00400                 fxch st(4)
00401                 mov eax,[int0]
00402                 movsx eax,word ptr [edx+2*eax]
00403                 mov [edi+0*8*2],ax
00404                 fistp dword ptr [int7]
00405                 mov eax,[int7]
00406                 fld st(0)
00407                 movsx eax,word ptr [edx+2*eax]
00408                 mov [edi+7*8*2],ax
00409                 fadd qword ptr [tmp3]
00410                 fistp dword ptr [int3]
00411                 mov eax,[int3]
00412                 movsx eax,word ptr [edx+2*eax]
00413                 mov [edi+3*8*2],ax
00414                 fsub qword ptr [tmp3]
00415                 fld st(1)
00416                 fxch st(1)
00417                 fistp dword ptr [int4]
00418                 mov eax,[int4]
00419                 movsx eax,word ptr [edx+2*eax]
00420                 mov [edi+4*8*2],ax
00421                 fadd qword ptr [tmp1]
00422                 fxch st(3)
00423                 fmulp st(2),st(0)
00424                 fxch st(2)
00425                 fistp dword ptr [int1]
00426                 fxch st(1)
00427                 mov eax,[int1]
00428                 movsx eax,word ptr [edx+2*eax]
00429                 mov [edi+1*8*2],ax
00430                 fsub qword ptr [tmp1]
00431                 fld st(2)
00432                 fsub st(0),st(2)
00433                 fxch st(1)
00434                 fistp dword ptr [int6]
00435                 fxch st(2)
00436                 mov eax,[int6]
00437                 faddp st(1),st(0)
00438                 movsx eax,word ptr [edx+2*eax]
00439                 mov [edi+6*8*2],ax
00440                 fistp dword ptr [int2]
00441                 mov eax,[int2]
00442                 movsx eax,word ptr [edx+2*eax]
00443                 mov [edi+2*8*2],ax
00444                 fistp dword ptr [int5]
00445                 mov eax,[int5]
00446                 movsx eax,word ptr [edx+2*eax]
00447                 mov [edi+5*8*2],ax
00448 
// Advance to the next scratch column (one double) and output column (one short).
00449           add esi,8
00450                 add edi,2
00451                 dec ecx
00452                 jnz Lcols
00453 
// Release the stack space reserved at the top of the block.
00454                 add esp,SIZE
00455   }
00456 }

Generated on Tue Dec 13 14:47:24 2005 for guliverkli by  doxygen 1.4.5