idctref.cpp

00001 #include "stdafx.h"
00002 
00003 /* idctref_miha.c, Inverse Discrete Cosine Transform, double precision */
00004 
00005 /*************************************************************/
00006 /*                                                           */
00007 /* x87 hand-optimized assembly by Miha Peternel              */
00008 /*                                     27.11. - 20.1.2001    */
00009 /*                                                           */
00010 /* You are free to use this code in your project if:         */
00011 /* - no changes are made to this message                     */
00012 /* - any changes to this code are publicly available         */
00013 /* - your project documentation contains the following text: */
00014 /*   "This software contains fast high-quality IDCT decoder  */
00015 /*    by Miha Peternel."                                     */
00016 /*                                                           */
00017 /*************************************************************/
00018 
00019 /*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
00020  *  direct matrix multiply) Inverse Discrete Cosine Transform
00021 */
00022 
/* Defining ModelX enables the C-level optimizations by Miha Peternel.
   In this file it makes Initialize_REF_IDCT build the clipping table. */
#define ModelX 123

/* math.h supplies sqrt() and cos(), which generate the transform
   constants at run time.  Compiler results may vary a little. */
#include <math.h>

/* Not every <math.h> provides M_PI.  Define it only when it is missing,
   so that a header which does provide it is not redefined. */
#ifndef M_PI
#define M_PI    3.1415926535897932384626433832795
#endif

/* Rounding bias: REF_IDCT adds it to every result before flooring. */
static const double HALF = 0.5;

/* private data */
static short iclip[1024+1024]; /* clipping table */
static short *iclp;            /* Initialize_REF_IDCT points this at
                                  iclip+1024, so it accepts indices
                                  -1024..1023 */

/* cosine transform matrix for 8x1 IDCT, indexed c[freq][time] */
static double c[8][8];
00039 
00040 /* initialize DCT coefficient matrix */
00041 void Initialize_REF_IDCT()
00042 {
00043   int freq, time, i;
00044   double scale;
00045 
00046   for (freq=0; freq < 8; freq++)
00047   {
00048     scale = (freq == 0) ? sqrt(0.125) : 0.5;
00049     for (time=0; time<8; time++)
00050       c[freq][time] = scale*cos((M_PI/8.0)*freq*(time + 0.5));
00051   }
00052 
00053 #ifdef ModelX
00054   iclp = iclip+1024;
00055   for (i= -1024; i<1024; i++)
00056     iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
00057 #endif
00058 }
00059 
00060 void REF_IDCT(short *block)
00061 {
00062   double tmp[64];
00063         double rnd[64];
00064         int int0, int1, int2, int3, int4, int5, int6, int7;
00065         unsigned short fpold;
00066         unsigned short fpnew;
00067 
00068         int *b = (int *) block;
00069 
00070   if( !(b[0]|(b[31]&~0x10000)) )
00071         {
00072           if( b[ 1]|b[ 2]|b[ 3]|b[ 4]|b[ 5]|b[ 6] )
00073                   goto normal;
00074           if( b[ 7]|b[ 8]|b[ 9]|b[10]|b[11]|b[12] )
00075                   goto normal;
00076           if( b[13]|b[14]|b[15]|b[16]|b[17]|b[18] )
00077                   goto normal;
00078           if( b[19]|b[20]|b[21]|b[22]|b[23]|b[24] )
00079                   goto normal;
00080           if( b[25]|b[26]|b[27]|b[28]|b[29]|b[30] )
00081                   goto normal;
00082                 b[31]=0;
00083                 return;
00084         }
00085 normal:
00086 
00087         __asm
00088         {
00089                 // do the IDCT
00090                 mov esi,[block]
00091                 lea eax,[c]
00092                 lea edi,[tmp]
00093                 //mov ebx,8
00094                 mov ebx,8 // 0x77000000 // 8
00095                 align 16
00096         __col1:
00097             movzx edx,[esi+1*2]
00098                         mov   ecx,[esi+2*2]
00099                         or    edx,[esi+4*2]
00100                         or    ecx,[esi+6*2]
00101                         or edx,ecx
00102                         //mov ecx,8
00103                         mov ecx,8/2 // 0x77000000 // 8
00104 
00105                         jnz __row1
00106                                 fild  word ptr [esi+0*2]
00107                                 fmul qword ptr [eax+0*8*8]
00108                                 fst  qword ptr [edi+0*8]
00109                                 fst  qword ptr [edi+1*8]
00110                                 fst  qword ptr [edi+2*8]
00111                                 fst  qword ptr [edi+3*8]
00112                                 fst  qword ptr [edi+4*8]
00113                                 fst  qword ptr [edi+5*8]
00114                                 fst  qword ptr [edi+6*8]
00115                                 fstp qword ptr [edi+7*8]
00116                                 add edi,8*8
00117                                 jmp __next1
00118                         align 16
00119                 __row1:
00120                                 fild  word ptr [esi+0*2]
00121                                 fmul qword ptr [eax+0*8*8]
00122                                 fild  word ptr [esi+1*2]
00123                                 fmul qword ptr [eax+1*8*8]
00124                                 fadd
00125                                 fild  word ptr [esi+2*2]
00126                                 fmul qword ptr [eax+2*8*8]
00127                                 fadd
00128                                 fild  word ptr [esi+3*2]
00129                                 fmul qword ptr [eax+3*8*8]
00130                                 fadd
00131                                 fild  word ptr [esi+4*2]
00132                                 fmul qword ptr [eax+4*8*8]
00133                                 fadd
00134                                 fild  word ptr [esi+5*2]
00135                                 fmul qword ptr [eax+5*8*8]
00136                                 fadd
00137                                 fild  word ptr [esi+6*2]
00138                                 fmul qword ptr [eax+6*8*8]
00139                                 fadd
00140                                 fild  word ptr [esi+7*2]
00141                                 fmul qword ptr [eax+7*8*8]
00142                                 fadd
00143 
00144                                 fild  word ptr [esi+0*2]
00145                                 fmul qword ptr [eax+0*8*8+8]
00146                                 fild  word ptr [esi+1*2]
00147                                 fmul qword ptr [eax+1*8*8+8]
00148                                 fadd
00149                                 fild  word ptr [esi+2*2]
00150                                 fmul qword ptr [eax+2*8*8+8]
00151                                 fadd
00152                                 fild  word ptr [esi+3*2]
00153                                 fmul qword ptr [eax+3*8*8+8]
00154                                 fadd
00155                                 fild  word ptr [esi+4*2]
00156                                 fmul qword ptr [eax+4*8*8+8]
00157                                 fadd
00158                                 fild  word ptr [esi+5*2]
00159                                 fmul qword ptr [eax+5*8*8+8]
00160                                 fadd
00161                                 fild  word ptr [esi+6*2]
00162                                 fmul qword ptr [eax+6*8*8+8]
00163                                 fadd
00164                                 fild  word ptr [esi+7*2]
00165                                 fmul qword ptr [eax+7*8*8+8]
00166                                 fadd
00167                                 add eax,8*2
00168                                 fxch st(1)
00169                                 fstp qword ptr [edi]//
00170                                 fstp qword ptr [edi+8]
00171                                 add edi,8*2
00172                         dec ecx
00173 
00174                         jnz __row1
00175                         add eax,-8*8
00176                           //align 16
00177                 __next1:
00178                         add esi,+8*2
00179 
00180                 sub ebx,0x80000001 // add ebx,ebx 
00181                 js  __col1
00182                         //align 16
00183                         test ebx,ebx // align jump &| redo flags
00184                 jnz __col1
00185 
00186                 lea esi,[tmp]
00187                 lea eax,[c]
00188                 lea edi,[rnd]
00189                 //mov edi,[block]
00190     fld qword ptr [HALF]
00191                 mov ebx,8
00192         __row2:
00193                         mov ecx,8/2
00194                         align 16
00195                         __col2:
00196                                 fld  qword ptr [esi+0*8*8]
00197                                 fmul qword ptr [eax+0*8*8]
00198                                 fld  qword ptr [esi+1*8*8]
00199                                 fmul qword ptr [eax+1*8*8]
00200                                 fadd
00201                                 fld  qword ptr [esi+2*8*8]
00202                                 fmul qword ptr [eax+2*8*8]
00203                                 fadd
00204                                 fld  qword ptr [esi+3*8*8]
00205                                 fmul qword ptr [eax+3*8*8]
00206                                 fadd
00207                                 fld  qword ptr [esi+4*8*8]
00208                                 fmul qword ptr [eax+4*8*8]
00209                                 fadd
00210                                 fld  qword ptr [esi+5*8*8]
00211                                 fmul qword ptr [eax+5*8*8]
00212                                 fadd
00213                                 fld  qword ptr [esi+6*8*8]
00214                                 fmul qword ptr [eax+6*8*8]
00215                                 fadd
00216                                 fld  qword ptr [esi+7*8*8]
00217                                 fmul qword ptr [eax+7*8*8]
00218                                 fadd
00219                                 fadd st(0),st(1)
00220 
00221                                 fxch st(1)
00222 
00223                                 fld  qword ptr [esi+0*8*8]
00224                                 fmul qword ptr [eax+0*8*8+8]
00225                                 fld  qword ptr [esi+1*8*8]
00226                                 fmul qword ptr [eax+1*8*8+8]
00227                                 fadd
00228                                 fld  qword ptr [esi+2*8*8]
00229                                 fmul qword ptr [eax+2*8*8+8]
00230                                 fadd
00231                                 fld  qword ptr [esi+3*8*8]
00232                                 fmul qword ptr [eax+3*8*8+8]
00233                                 fadd
00234                                 fld  qword ptr [esi+4*8*8]
00235                                 fmul qword ptr [eax+4*8*8+8]
00236                                 fadd
00237                                 fld  qword ptr [esi+5*8*8]
00238                                 fmul qword ptr [eax+5*8*8+8]
00239                                 fadd
00240                                 fld  qword ptr [esi+6*8*8]
00241                                 fmul qword ptr [eax+6*8*8+8]
00242                                 fadd
00243                                 fld  qword ptr [esi+7*8*8]
00244                                 fmul qword ptr [eax+7*8*8+8]
00245                                 fadd
00246                                 fadd st(0),st(1)
00247                                 add eax,8*2
00248 
00249                                 fxch st(2)
00250                                 fstp qword ptr [edi]
00251                                 fxch st(1)
00252                                 fstp qword ptr [edi+8*8]
00253                                 add edi,8*8*2
00254 
00255                         dec ecx
00256 
00257                         jnz __col2
00258                         add eax,-8*8
00259                         add esi,+8
00260                         add edi,8-8*8*8
00261 
00262                 sub ebx,0x80000001
00263                 js  __row2
00264                           //align 16
00265                                 test ebx,ebx // align jump &| redo flags
00266                 jnz __row2
00267                 ffree st(0) // bye bye 0.5
00268 
00269           // set x87 to floor mode
00270                 fstcw [fpold]
00271                 movzx eax, [fpold]
00272 
00273                 or eax, 0x0400 // round down - floor
00274                 mov [fpnew], ax
00275                 fldcw [fpnew]
00276 
00277                 // now floor the damn array
00278                 lea esi, [rnd]
00279                 mov edi, [block]
00280                 mov ebx, -256 // clip min
00281                 mov edx, +255 // clip max
00282                 mov ecx, 8
00283                 align 16
00284         __floor:
00285                   fld   qword ptr [esi+0*8]
00286                         fistp dword ptr [int0]
00287                           mov eax,[int0]
00288                                 cmp   eax,ebx
00289                                 cmovl eax,ebx
00290                                 cmp   eax,edx
00291                                 cmovg eax,edx
00292                   fld   qword ptr [esi+1*8]
00293                         fistp dword ptr [int1]
00294                                 mov word ptr [edi+0*2],ax
00295                           mov eax,[int1]
00296                                 cmp   eax,ebx
00297                                 cmovl eax,ebx
00298                                 cmp   eax,edx
00299                                 cmovg eax,edx
00300                   fld   qword ptr [esi+2*8]
00301                         fistp dword ptr [int2]
00302                                 mov word ptr [edi+1*2],ax
00303                           mov eax,[int2]
00304                                 cmp   eax,ebx
00305                                 cmovl eax,ebx
00306                                 cmp   eax,edx
00307                                 cmovg eax,edx
00308                   fld   qword ptr [esi+3*8]
00309                         fistp dword ptr [int3]
00310                                 mov word ptr [edi+2*2],ax
00311                           mov eax,[int3]
00312                                 cmp   eax,ebx
00313                                 cmovl eax,ebx
00314                                 cmp   eax,edx
00315                                 cmovg eax,edx
00316                   fld   qword ptr [esi+4*8]
00317                         fistp dword ptr [int4]
00318                                 mov word ptr [edi+3*2],ax
00319                           mov eax,[int4]
00320                                 cmp   eax,ebx
00321                                 cmovl eax,ebx
00322                                 cmp   eax,edx
00323                                 cmovg eax,edx
00324                   fld   qword ptr [esi+5*8]
00325                         fistp dword ptr [int5]
00326                                 mov word ptr [edi+4*2],ax
00327                           mov eax,[int5]
00328                                 cmp   eax,ebx
00329                                 cmovl eax,ebx
00330                                 cmp   eax,edx
00331                                 cmovg eax,edx
00332                   fld   qword ptr [esi+6*8]
00333                         fistp dword ptr [int6]
00334                                 mov word ptr [edi+5*2],ax
00335                           mov eax,[int6]
00336                                 cmp   eax,ebx
00337                                 cmovl eax,ebx
00338                                 cmp   eax,edx
00339                                 cmovg eax,edx
00340                   fld   qword ptr [esi+7*8]
00341                         fistp dword ptr [int7]
00342                                 mov word ptr [edi+6*2],ax
00343                           mov eax,[int7]
00344                                 cmp   eax,ebx
00345                                 cmovl eax,ebx
00346                                 cmp   eax,edx
00347                                 cmovg eax,edx
00348                                 mov word ptr [edi+7*2],ax
00349 
00350                         add esi, 8*8
00351                         add edi, 8*2
00352 
00353                 sub ecx,0x80000001
00354                 js  __floor
00355                           //align 16
00356                                 test ecx,ecx // align jump &| redo flags
00357                 jnz __floor
00358 
00359                 // set x87 to default mode
00360                 fldcw [fpold]
00361         };
00362 }

Generated on Tue Dec 13 14:47:24 2005 for guliverkli by  doxygen 1.4.5