00001 #include "stdafx.h"
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00025
00026
00027
00028
00030
00031 #include <math.h>
// Circle constant used by Initialize_FPU_IDCT() when deriving the weights.
#define PI 3.1415926535897932384626433832795

// Type of every coefficient below.  The inline assembly in FPU_IDCT loads
// these variables with "qword ptr", i.e. as 8-byte values.
#define FLOAT double

// 1024^4 * 256 * 16 + 1024, i.e. 2^52 + 1024.
// No use of RC is visible in this file.
static const double RC = 1.0*1024*1024*1024*1024*256*16 + 1024;

// Base weights; Initialize_FPU_IDCT() sets Wk = sqrt(2) * cos(k * PI / 16).
static FLOAT W1, W2, W3, W5, W6, W7;

// Differences ("m") and sums ("p") of weight pairs, set at initialization.
static FLOAT W1mW7, W1pW7;
static FLOAT W3mW5, W3pW5;
static FLOAT W2mW6, W2pW6;

// "_8" copies: Initialize_FPU_IDCT() stores the matching value divided by 8.
static FLOAT W1_8, W2_8, W3_8, W5_8, W6_8, W7_8;
static FLOAT W1mW7_8, W1pW7_8;
static FLOAT W3mW5_8, W3pW5_8;
static FLOAT W2mW6_8, W2pW6_8;

// S2 is set to sqrt(0.5) at initialization; D8 is the constant 1/8.
static FLOAT S2;
static FLOAT D8 = 1.0/8;

// Clamp lookup table.  Initialize_FPU_IDCT() points iclp at the middle entry
// (iclip + 1024) so that iclp[-1024] .. iclp[1023] are valid, and stores in
// each entry its own index clamped to the range [-256, 255].
static short iclip[1024+1024];
static short *iclp;
00078
00079 void Initialize_FPU_IDCT()
00080 {
00081 int i;
00082
00083 S2 = sqrt(0.5);
00084
00085 W1 = sqrt(2.0)*cos(PI*(1.0/16));
00086 W1_8 = W1/8;
00087 W2 = sqrt(2.0)*cos(PI*(2.0/16));
00088 W2_8 = W2/8;
00089 W3 = sqrt(2.0)*cos(PI*(3.0/16));
00090 W3_8 = W3/8;
00091 W5 = sqrt(2.0)*cos(PI*(5.0/16));
00092 W5_8 = W5/8;
00093 W6 = sqrt(2.0)*cos(PI*(6.0/16));
00094 W6_8 = W6/8;
00095 W7 = sqrt(2.0)*cos(PI*(7.0/16));
00096 W7_8 = W7/8;
00097
00098 W1mW7 = W1-W7; W1mW7_8 = W1mW7/8;
00099 W1pW7 = W1+W7; W1pW7_8 = W1pW7/8;
00100 W3mW5 = W3-W5; W3mW5_8 = W3mW5/8;
00101 W3pW5 = W3+W5; W3pW5_8 = W3pW5/8;
00102 W2mW6 = W2-W6; W2mW6_8 = W2mW6/8;
00103 W2pW6 = W2+W6; W2pW6_8 = W2pW6/8;
00104
00105 iclp = iclip+1024;
00106 for (i= -1024; i<1024; i++)
00107 iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
00108 }
00109
// In-place 8x8 inverse DCT (going by the function name) computed on the x87
// FPU with MSVC-style x86 inline assembly.
//
// block: 64 shorts, read as 8 rows of 8 and overwritten with the result.
//
// Initialize_FPU_IDCT() must have run first: this code reads the W*, S2
// constants and the iclip clamp table that it fills in.
//
// Two passes:
//   1. rows:    each row of 8 shorts becomes 8 doubles in an on-stack 8x8
//               buffer;
//   2. columns: each column of that buffer becomes 8 integers, each of which
//               is passed through the clamp table and stored back into block.
00110 void FPU_IDCT(short *block)
00111 {
// Fast path.  The 64 shorts are viewed as 32 ints (two shorts per int, which
// assumes a 32-bit int).  On little-endian x86, b[31]==0x10000 means
// block[62]==0 and block[63]==1.  So: when every coefficient is zero, except
// that the last one may be 1, the last one is cleared and the function
// returns with an all-zero block.
// NOTE(review): presumably the lone trailing 1 is what MPEG-2 mismatch
// control leaves in an otherwise empty block -- confirm against the caller.
// NOTE(review): reading the short array through an int* is not permitted by
// ISO C aliasing rules; this relies on the compiler tolerating it.
00112 int *b = (int *) block;
00113 if( b[0]==0 && (b[31]==0x10000 || b[31]==0) )
00114 {
00115 if( b[ 1]|b[ 2]|b[ 3]|b[ 4]|b[ 5] )
00116 goto normal;
00117 if( b[ 6]|b[ 7]|b[ 8]|b[ 9]|b[10] )
00118 goto normal;
00119 if( b[11]|b[12]|b[13]|b[14]|b[15] )
00120 goto normal;
00121 if( b[16]|b[17]|b[18]|b[19]|b[20] )
00122 goto normal;
00123 if( b[21]|b[22]|b[23]|b[24]|b[25] )
00124 goto normal;
00125 if( b[26]|b[27]|b[28]|b[29]|b[30] )
00126 goto normal;
00127 b[31]=0;
00129 return;
00130 }
00131 normal:
00132
// Stack layout, addressed relative to ebx (the names are substituted
// textually inside the __asm block):
//   tmp        : start of the 8x8 buffer of doubles (8*8*8 bytes, upward
//                from ebx)
//   tmp1..tmp3 : three 8-byte scratch doubles directly below ebx
//   int0..int7 : eight 4-byte integer slots below those
// SIZE is the total number of bytes taken from the stack.
// NOTE(review): none of these macros is #undef'd in the visible code, so
// they stay defined for whatever follows this function.
00133 #define tmp ebx
00134 #define tmp1 ebx-1*8
00135 #define tmp2 ebx-2*8
00136 #define tmp3 ebx-3*8
00137 #define int0 ebx-3*8-1*4
00138 #define int1 ebx-3*8-2*4
00139 #define int2 ebx-3*8-3*4
00140 #define int3 ebx-3*8-4*4
00141 #define int4 ebx-3*8-5*4
00142 #define int5 ebx-3*8-6*4
00143 #define int6 ebx-3*8-7*4
00144 #define int7 ebx-3*8-8*4
00145 #define SIZE 8*8*8+3*8+8*4+16 // locals + 16-byte alignment area
00146 __asm
00147 {
// ebx = (esp - 512) rounded down to a multiple of 16; esp itself drops by
// SIZE.  The rounding can lower ebx by up to 15 bytes, which the extra 16
// bytes in SIZE allow for.
00148 lea ebx,[esp-8*8*8]
00149 sub esp,SIZE
00150 and ebx,-16
00151
00152
// ---- Pass 1: rows ----
// esi = current input row (8 shorts), edi = current output row (8 doubles),
// ecx = rows left.
// NOTE(review): `block` is read here after esp has been moved; this relies
// on the compiler not addressing the parameter relative to esp -- confirm.
00153 mov esi,[block]
00154 lea edi,[tmp]
00155 mov ecx,8
00156
00157 align 16
00158 Lrows:
// eax becomes non-zero iff any of coefficients 1..7 of this row is non-zero:
// word 1 is loaded, then the dwords covering words 2-3, 4-5 and 6-7 are
// OR-ed in.
00159 movsx eax,word ptr [esi+2]
00160 or eax, [esi+4]
00161 or eax, [esi+8]
00162 or eax, [esi+12]
00163 jnz L1
00164
// Only coefficient 0 can be non-zero: its value, converted to double, is
// written to all 8 outputs of the row.
00165 fild word ptr [esi+0*2]
00166 fst qword ptr [edi+7*8]
00167 fst qword ptr [edi+6*8]
00168 fst qword ptr [edi+5*8]
00169 fst qword ptr [edi+4*8]
00170 fst qword ptr [edi+3*8]
00171 fst qword ptr [edi+2*8]
00172 fst qword ptr [edi+1*8]
00173 fstp qword ptr [edi+0*8]
00174 jmp L2
00175
00176 align 16
00177 L1:
00178
// General row: loads the 8 coefficients with fild, combines them with the
// weights W7, W1mW7, W1pW7, W3, W3mW5, W3pW5, W6, W2mW6, W2pW6 and S2, uses
// tmp1..tmp3 as spill slots, and stores 8 doubles to [edi+0*8]..[edi+7*8].
// Every fxch / st(n) operand depends on the exact x87 register-stack state
// left by the previous instructions, so the order must not be changed.
00179 fild word ptr [esi+7*2]
00180 fld st(0)
00181 fild word ptr [esi+1*2]
00182 fadd st(1),st(0)
00183 fld qword ptr [W7]
00184 fxch st(1)
00185 fmul qword ptr [W1mW7]
00186 fxch st(1)
00187 fmulp st(2),st(0)
00188 fadd st(0),st(1)
00189 fstp qword ptr [tmp1]
00190 fild word ptr [esi+3*2]
00191 fld st(0)
00192 fxch st(3)
00193 fmul qword ptr [W1pW7]
00194 fild word ptr [esi+5*2]
00195 fadd st(4),st(0)
00196 fmul qword ptr [W3mW5]
00197 fxch st(1)
00198 fsubp st(3),st(0)
00199 fld qword ptr [W3]
00200 fmulp st(4),st(0)
00201 fsubr st(0),st(3)
00202 fstp qword ptr [tmp2]
00203 fmul qword ptr [W3pW5]
00204 fsubp st(2),st(0)
00205 fxch st(1)
00206 fstp qword ptr [tmp3]
00207 fild word ptr [esi+0*2]
00208 fild word ptr [esi+4*2]
00209 fild word ptr [esi+2*2]
00210 fld st(0)
00211 fmul qword ptr [W2mW6]
00212 fld st(3)
00213 fild word ptr [esi+6*2]
00214 fxch st(5)
00215 fsub st(0),st(4)
00216 fxch st(3)
00217 fadd st(0),st(5)
00218 fxch st(1)
00219 faddp st(4),st(0)
00220 fld qword ptr [W6]
00221 fmulp st(1),st(0)
00222 fxch st(4)
00223 fmul qword ptr [W2pW6]
00224 fld qword ptr [tmp1]
00225 fsub qword ptr [tmp2]
00226 fld st(5)
00227 fxch st(3)
00228 faddp st(6),st(0)
00229 fld qword ptr [tmp1]
00230 fxch st(1)
00231 fstp qword ptr [tmp1]
00232 fld st(6)
00233 fadd qword ptr [tmp3]
00234 fxch st(1)
00235 fadd qword ptr [tmp2]
00236 fxch st(7)
00237 fsub qword ptr [tmp3]
00238 fxch st(1)
00239 fstp qword ptr [tmp2]
00240 fld st(4)
00241 fxch st(3)
00242 fsubrp st(2),st(0)
00243 fxch st(4)
00244 fsub st(0),st(5)
00245 fxch st(2)
00246 faddp st(5),st(0)
00247 fld st(2)
00248 fsub st(0),st(1)
00249 fxch st(5)
00250 fstp qword ptr [tmp3]
00251 fld qword ptr [tmp1]
00252 fld qword ptr [S2]
00253 fxch st(4)
00254 faddp st(2),st(0)
00255 fld st(3)
00256 fxch st(1)
00257 fadd st(0),st(5)
00258 fmulp st(1),st(0)
00259
00260 fld qword ptr [tmp3]
00261 fadd st(0),st(7)
00262 fxch st(5)
00263 fsubr qword ptr [tmp1]
00264 fxch st(5)
00265 fstp qword ptr [edi+0*8]
00266 fxch st(6)
00267 fsubr qword ptr [tmp3]
00268 fld st(2)
00269 fxch st(1)
00270 fstp qword ptr [edi+7*8]
00271 fadd qword ptr [tmp2]
00272 fxch st(3)
00273 fmulp st(4),st(0)
00274 fxch st(2)
00275 fstp qword ptr [edi+3*8]
00276 fld st(1)
00277 fadd st(0),st(5)
00278 fxch st(1)
00279 fsub qword ptr [tmp2]
00280 fxch st(2)
00281 fsubrp st(5),st(0)
00282 fstp qword ptr [edi+1*8]
00283 fld st(2)
00284 fxch st(1)
00285 fstp qword ptr [edi+4*8]
00286 fxch st(2)
00287 fsub st(0),st(1)
00288 fxch st(2)
00289 faddp st(1),st(0)
00290 fxch st(2)
00291 fstp qword ptr [edi+6*8]
00292 fstp qword ptr [edi+5*8]
00293 fstp qword ptr [edi+2*8]
00294 L2:
// Next row: 8 shorts further in the input, 8 doubles further in the buffer.
00295 add esi,8*2
00296 add edi,8*8
00297 dec ecx
00298 jnz Lrows
00299
00300
// ---- Pass 2: columns ----
// esi = current column in the buffer of doubles, edi = the same column in
// block, edx = address of iclip[1024] (the entry Initialize_FPU_IDCT() uses
// for index 0), ecx = columns left.
00301 lea esi,[tmp]
00302 mov edi,[block]
00303 lea edx,[iclip+1024*2]
00304 mov ecx,8
00305
00306 align 16
00307 Lcols:
// Elements of one column are 8*8 bytes apart in the buffer (one row of
// doubles).  This pass uses the "_8" weights and D8, i.e. the weights
// divided by 8 and the constant 1/8, again with tmp1..tmp3 as spill slots.
// As in the row pass, the instruction order is tied to the x87 stack state.
00308 fld qword ptr [esi+7*8*8]
00309 fld st(0)
00310 fld qword ptr [esi+1*8*8]
00311 fadd st(1),st(0)
00312 fld qword ptr [W7_8]
00313 fxch st(1)
00314 fmul qword ptr [W1mW7_8]
00315 fxch st(1)
00316 fmulp st(2),st(0)
00317 fadd st(0),st(1)
00318 fstp qword ptr [tmp2]
00319 fld qword ptr [esi+3*8*8]
00320 fld st(0)
00321 fxch st(3)
00322 fmul qword ptr [W1pW7_8]
00323 fld qword ptr [esi+5*8*8]
00324 fadd st(4),st(0)
00325 fmul qword ptr [W3mW5_8]
00326 fxch st(1)
00327 fsubp st(3),st(0)
00328 fld qword ptr [W3_8]
00329 fmulp st(4),st(0)
00330 fsubr st(0),st(3)
00331 fstp qword ptr [tmp3]
00332 fld qword ptr [D8]
00333 fld qword ptr [esi+0*8*8]
00334 fmul st(0),st(1)
00335 fxch st(2)
00336 fmul qword ptr [W3pW5_8]
00337 fld qword ptr [esi+4*8*8]
00338 fmulp st(2),st(0)
00339 fld qword ptr [esi+6*8*8]
00340 fld st(3)
00341 fxch st(6)
00342 fsubrp st(2),st(0)
00343 fld qword ptr [esi+2*8*8]
00344 fld st(0)
00345 fxch st(5)
00346 fsub st(0),st(4)
00347 fxch st(7)
00348 faddp st(4),st(0)
00349 fxch st(4)
00350 fadd st(0),st(1)
00351 fld qword ptr [W6_8]
00352 fxch st(2)
00353 fmul qword ptr [W2pW6_8]
00354 fxch st(2)
00355 fmulp st(1),st(0)
00356 fxch st(4)
00357 fmul qword ptr [W2mW6_8]
00358 fld qword ptr [tmp2]
00359 fsub qword ptr [tmp3]
00360 fxch st(2)
00361 fsubr st(0),st(5)
00362 fxch st(1)
00363 faddp st(5),st(0)
00364 fld qword ptr [tmp2]
00365 fxch st(2)
00366 fstp qword ptr [tmp2]
00367 fld st(5)
00368 fxch st(2)
00369 fadd qword ptr [tmp3]
00370 fxch st(6)
00371 fsub st(0),st(3)
00372 fxch st(2)
00373 faddp st(3),st(0)
00374 fld st(3)
00375 fsub st(0),st(5)
00376 fxch st(3)
00377 fstp qword ptr [tmp3]
00378 fxch st(3)
00379 faddp st(4),st(0)
00380 fld st(5)
00381 fld qword ptr [tmp2]
00382 fxch st(7)
00383 fsub st(0),st(4)
00384 fxch st(7)
00385 fadd st(0),st(2)
00386 fxch st(1)
00387 faddp st(4),st(0)
00388 fld qword ptr [S2]
00389 fmul st(1),st(0)
00390 fxch st(1)
00391 fstp qword ptr [tmp1]
00392 fld st(4)
00393 fadd st(0),st(6)
00394 fxch st(2)
00395 fsubr qword ptr [tmp2]
00396 fxch st(5)
00397 fsubrp st(6),st(0)
00398 fxch st(1)
// Output stage, repeated for each of the 8 results of the column: fistp
// converts the value to a 32-bit integer in an intN slot (rounded according
// to the current x87 rounding mode), that integer indexes the clamp table
// through edx, and the table entry is stored as a 16-bit word into block.
// Rows of block are 8*2 bytes apart.
// NOTE(review): the integer is used as a table index without a range check,
// while the table only covers indices -1024..1023.
00399 fistp dword ptr [int0]
00400 fxch st(4)
00401 mov eax,[int0]
00402 movsx eax,word ptr [edx+2*eax]
00403 mov [edi+0*8*2],ax
00404 fistp dword ptr [int7]
00405 mov eax,[int7]
00406 fld st(0)
00407 movsx eax,word ptr [edx+2*eax]
00408 mov [edi+7*8*2],ax
00409 fadd qword ptr [tmp3]
00410 fistp dword ptr [int3]
00411 mov eax,[int3]
00412 movsx eax,word ptr [edx+2*eax]
00413 mov [edi+3*8*2],ax
00414 fsub qword ptr [tmp3]
00415 fld st(1)
00416 fxch st(1)
00417 fistp dword ptr [int4]
00418 mov eax,[int4]
00419 movsx eax,word ptr [edx+2*eax]
00420 mov [edi+4*8*2],ax
00421 fadd qword ptr [tmp1]
00422 fxch st(3)
00423 fmulp st(2),st(0)
00424 fxch st(2)
00425 fistp dword ptr [int1]
00426 fxch st(1)
00427 mov eax,[int1]
00428 movsx eax,word ptr [edx+2*eax]
00429 mov [edi+1*8*2],ax
00430 fsub qword ptr [tmp1]
00431 fld st(2)
00432 fsub st(0),st(2)
00433 fxch st(1)
00434 fistp dword ptr [int6]
00435 fxch st(2)
00436 mov eax,[int6]
00437 faddp st(1),st(0)
00438 movsx eax,word ptr [edx+2*eax]
00439 mov [edi+6*8*2],ax
00440 fistp dword ptr [int2]
00441 mov eax,[int2]
00442 movsx eax,word ptr [edx+2*eax]
00443 mov [edi+2*8*2],ax
00444 fistp dword ptr [int5]
00445 mov eax,[int5]
00446 movsx eax,word ptr [edx+2*eax]
00447 mov [edi+5*8*2],ax
00448
// Next column: one double further in the buffer, one short further in block.
00449 add esi,8
00450 add edi,2
00451 dec ecx
00452 jnz Lcols
00453
// Give back the stack area taken at the top of the __asm block.
00454 add esp,SIZE
00455 }
00456 }