mc_sse2.cpp

00001 /* 
00002  *      Copyright (C) 2003-2005 Gabest
00003  *      http://www.gabest.org
00004  *
00005  *  This Program is free software; you can redistribute it and/or modify
00006  *  it under the terms of the GNU General Public License as published by
00007  *  the Free Software Foundation; either version 2, or (at your option)
00008  *  any later version.
00009  *   
00010  *  This Program is distributed in the hope that it will be useful,
00011  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00013  *  GNU General Public License for more details.
00014  *   
00015  *  You should have received a copy of the GNU General Public License
00016  *  along with GNU Make; see the file COPYING.  If not, write to
00017  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
00018  *  http://www.gnu.org/copyleft/gpl.html
00019  *
00020  *  Based on Intel's AP-942
00021  *
00022  */
00023 
#include "stdafx.h"
#include <emmintrin.h>
#include "libmpeg2.h"
00026 
00027 __declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
00028 
00029 static void MC_put_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00030 {
00031         __asm
00032         {
00033                 mov edx, ref
00034                 mov ecx, dest
00035                 mov esi, height
00036                 mov eax, stride
00037                 lea edi, [eax+eax]
00038 
00039         MC_put_o_16_sse2_loop:
00040 
00041                 movdqu xmm0, [edx]
00042                 movdqu xmm1, [edx+eax] 
00043                 movdqa [ecx], xmm0
00044                 movdqa [ecx+eax], xmm1 
00045                 add edx, edi
00046                 add ecx, edi
00047                 sub esi, 2
00048 
00049                 jg MC_put_o_16_sse2_loop
00050         }
00051 }
00052 
/*
 * Copy an 8-byte-wide block from ref to dest (no interpolation).
 *
 * dest   - destination; written with 64-bit stores, no alignment required
 * ref    - source; may be unaligned
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * Implemented with SSE2 intrinsics rather than MSVC __asm so the routine
 * also builds for x64 and with other compilers.
 */
static void MC_put_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */

    for (; height > 0; height -= 2)
    {
        const __m128i row0 = _mm_loadl_epi64((const __m128i*)ref);
        const __m128i row1 = _mm_loadl_epi64((const __m128i*)(ref + stride));

        _mm_storel_epi64((__m128i*)dest, row0);
        _mm_storel_epi64((__m128i*)(dest + stride), row1);

        ref += pair;
        dest += pair;
    }
}
00076 
00077 static void MC_put_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00078 {
00079         __asm
00080         {
00081                 mov edx, ref
00082                 mov ecx, dest
00083                 mov eax, stride
00084                 mov esi, height
00085                 lea edi, [eax+eax]
00086 
00087         MC_put_x_16_sse2_loop:
00088 
00089                 movdqu xmm0, [edx]
00090                 movdqu xmm1, [edx+1]
00091                 movdqu xmm2, [edx+eax]
00092                 movdqu xmm3, [edx+eax+1]
00093                 pavgb xmm0, xmm1
00094                 pavgb xmm2, xmm3
00095                 movdqa [ecx], xmm0
00096                 movdqa [ecx+eax], xmm2
00097                 add edx, edi
00098                 add ecx, edi
00099                 sub esi, 2
00100 
00101                 jg MC_put_x_16_sse2_loop
00102         }
00103 }
00104 
/*
 * Horizontal half-pel prediction, 8 bytes wide:
 *   dest[x] = (ref[x] + ref[x + 1] + 1) >> 1   for every row.
 *
 * dest   - destination; written with 64-bit stores, no alignment required
 * ref    - source; may be unaligned, 9 bytes per row are read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * Implemented with SSE2 intrinsics rather than MSVC __asm so the routine
 * also builds for x64 and with other compilers.
 */
static void MC_put_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */

    for (; height > 0; height -= 2)
    {
        const uint8_t* next = ref + stride;

        /* only the low 8 byte lanes carry pixels; the upper lanes are zero */
        const __m128i out0 = _mm_avg_epu8(_mm_loadl_epi64((const __m128i*)ref),
                                          _mm_loadl_epi64((const __m128i*)(ref + 1)));
        const __m128i out1 = _mm_avg_epu8(_mm_loadl_epi64((const __m128i*)next),
                                          _mm_loadl_epi64((const __m128i*)(next + 1)));

        _mm_storel_epi64((__m128i*)dest, out0);
        _mm_storel_epi64((__m128i*)(dest + stride), out1);

        ref += pair;
        dest += pair;
    }
}
00131 
00132 static void MC_put_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00133 {
00134         __asm
00135         {
00136                 mov edx, ref
00137                 mov ecx, dest
00138                 mov eax, stride
00139                 mov esi, height
00140                 lea edi, [eax+eax]
00141 
00142                 movdqu xmm0, [edx] 
00143 
00144         MC_put_y_16_sse2_loop:
00145 
00146                 movdqu xmm1, [edx+eax] 
00147                 movdqu xmm2, [edx+edi] 
00148                 pavgb xmm0, xmm1 
00149                 pavgb xmm1, xmm2 
00150                 movdqa [ecx], xmm0 
00151                 movdqa [ecx+eax], xmm1 
00152                 movdqa xmm0, xmm2 
00153                 add edx, edi 
00154                 add ecx, edi
00155                 sub esi, 2
00156 
00157                 jg MC_put_y_16_sse2_loop
00158         }
00159 }
00160 
/*
 * Vertical half-pel prediction, 8 bytes wide:
 *   dest[y][x] = (ref[y][x] + ref[y + 1][x] + 1) >> 1.
 *
 * dest   - destination; written with 64-bit stores, no alignment required
 * ref    - source; may be unaligned; one row more than is written is read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * The earlier inline-asm version packed two rows into one XMM register and
 * carried the pair (row1,row2) into the next pass where (row2,row3) was
 * needed, so every row after the first two was averaged with the wrong
 * neighbour. Here each register holds a single row and only the last row
 * of a pass is carried over. SSE2 intrinsics are used instead of MSVC __asm
 * so the routine also builds for x64 and with other compilers.
 */
static void MC_put_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */
    __m128i upper = _mm_loadl_epi64((const __m128i*)ref);

    for (; height > 0; height -= 2)
    {
        const __m128i middle = _mm_loadl_epi64((const __m128i*)(ref + stride));
        const __m128i lower = _mm_loadl_epi64((const __m128i*)(ref + pair));

        _mm_storel_epi64((__m128i*)dest, _mm_avg_epu8(upper, middle));
        _mm_storel_epi64((__m128i*)(dest + stride), _mm_avg_epu8(middle, lower));

        upper = lower;   /* last row of this pass is the first of the next */
        ref += pair;
        dest += pair;
    }
}
00189 
00190 static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00191 {
00192 /*
00193         __asm
00194         {
00195                 mov edx, ref 
00196                 mov ecx, dest
00197                 mov eax, stride 
00198                 mov esi, height 
00199                 lea edi, [eax+eax] 
00200                 
00201                 movdqa xmm7, [const_1_16_bytes] 
00202                 movdqu xmm0, [edx] 
00203                 movdqu xmm1, [edx+1]
00204                 pavgb xmm0, xmm1 
00205 
00206         MC_put_xy_16_sse2_loop:
00207 
00208                 movdqu xmm2, [edx+eax] 
00209                 movdqu xmm3, [edx+eax+1] 
00210                 movdqu xmm4, [edx+edi] 
00211                 movdqu xmm5, [edx+edi+1] 
00212                 pavgb xmm2, xmm3 
00213                 pavgb xmm5, xmm4 
00214                 psubusb xmm2, xmm7 
00215                 movdqa xmm1, xmm0 
00216                 movdqa xmm0, xmm2 
00217                 pavgb xmm1, xmm2 
00218                 pavgb xmm2, xmm5
00219                 movdqa [ecx], xmm1
00220                 movdqa [ecx+eax], xmm2
00221                 add edx, edi
00222                 add ecx, edi
00223                 sub esi, 2
00224 
00225                 jg MC_put_xy_16_sse2_loop
00226         }
00227 */
00228         __asm
00229         {
00230                 mov edx, ref 
00231                 mov ecx, dest
00232                 mov eax, stride 
00233                 mov esi, height 
00234                 lea edi, [eax+eax] 
00235                 
00236                 movdqa xmm7, [const_1_16_bytes] 
00237                 movdqu xmm0, [edx] 
00238                 movdqu xmm1, [edx+1] 
00239 
00240         MC_put_xy_16_sse2_loop:
00241 
00242                 movdqu xmm2, [edx+eax] 
00243                 movdqu xmm3, [edx+eax+1] 
00244                 movdqu xmm4, [edx+edi] 
00245                 movdqu xmm5, [edx+edi+1] 
00246                 pavgb xmm0, xmm1 
00247                 pavgb xmm2, xmm3 
00248                 movdqa xmm1, xmm5 
00249                 pavgb xmm5, xmm4 
00250                 psubusb xmm2, xmm7 
00251                 pavgb xmm0, xmm2 
00252                 pavgb xmm2, xmm5
00253                 movdqa [ecx], xmm0
00254                 movdqa xmm0, xmm4
00255                 movdqa [ecx+eax], xmm2
00256                 add edx, edi
00257                 add ecx, edi
00258                 sub esi, 2
00259 
00260                 jg MC_put_xy_16_sse2_loop
00261         }
00262 }
00263 
/*
 * Diagonal (horizontal + vertical) half-pel prediction, 8 bytes wide.
 *
 * Cascaded-average approximation of the four-tap mean. With
 * avg(a, b) = (a + b + 1) >> 1, every output row y is
 *   v0 = avg(ref[y][x],     ref[y + 1][x]) - 1      (saturating at 0)
 *   v1 = avg(ref[y][x + 1], ref[y + 1][x + 1])
 *   dest[y][x] = avg(v0, v1)
 * which is the same formula MC_avg_xy_8_sse2 uses before blending.
 *
 * dest   - destination; written with 64-bit stores, no alignment required
 * ref    - source; may be unaligned; 9 bytes per row and one row more than
 *          is written are read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * Two defects of the earlier inline-asm version are fixed here:
 *  - the final average used the raw rows (xmm1) instead of the x+1 column
 *    average (xmm2), so no horizontal interpolation took place at all;
 *  - the two-rows-per-register carry handed (row1,row2) to the next pass
 *    where (row2,row3) was needed, corrupting every row after the first two.
 * SSE2 intrinsics are used instead of MSVC __asm so the routine also builds
 * for x64 and with other compilers.
 */
static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */
    const __m128i one = _mm_set1_epi8(1);

    /* first row of the pass: column x and column x + 1 */
    __m128i top = _mm_loadl_epi64((const __m128i*)ref);
    __m128i top_r = _mm_loadl_epi64((const __m128i*)(ref + 1));

    for (; height > 0; height -= 2)
    {
        const __m128i mid = _mm_loadl_epi64((const __m128i*)(ref + stride));
        const __m128i mid_r = _mm_loadl_epi64((const __m128i*)(ref + stride + 1));
        const __m128i bot = _mm_loadl_epi64((const __m128i*)(ref + pair));
        const __m128i bot_r = _mm_loadl_epi64((const __m128i*)(ref + pair + 1));

        const __m128i out0 = _mm_avg_epu8(_mm_subs_epu8(_mm_avg_epu8(top, mid), one),
                                          _mm_avg_epu8(top_r, mid_r));
        const __m128i out1 = _mm_avg_epu8(_mm_subs_epu8(_mm_avg_epu8(mid, bot), one),
                                          _mm_avg_epu8(mid_r, bot_r));

        _mm_storel_epi64((__m128i*)dest, out0);
        _mm_storel_epi64((__m128i*)(dest + stride), out1);

        /* bottom row becomes the top row of the next pass */
        top = bot;
        top_r = bot_r;
        ref += pair;
        dest += pair;
    }
}
00301 
00302 static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00303 {
00304         __asm
00305         {
00306                 mov edx, ref
00307                 mov ecx, dest
00308                 mov esi, height
00309                 mov eax, stride
00310                 lea edi, [eax+eax]
00311 
00312         MC_avg_o_16_sse2_loop:
00313 
00314                 movdqu xmm0, [edx]
00315                 movdqu xmm1, [edx+eax] 
00316                 movdqa xmm2, [ecx]
00317                 movdqa xmm3, [ecx+eax]
00318                 pavgb xmm0, xmm2
00319                 pavgb xmm1, xmm3
00320                 movdqa [ecx], xmm0
00321                 movdqa [ecx+eax], xmm1 
00322                 add edx, edi
00323                 add ecx, edi
00324                 sub esi, 2
00325 
00326                 jg MC_avg_o_16_sse2_loop
00327         }
00328 }
00329 
/*
 * Blend an 8-byte-wide block of ref into dest:
 *   dest[x] = (dest[x] + ref[x] + 1) >> 1   for every row.
 *
 * dest   - destination, read and written with 64-bit accesses; no alignment
 *          required
 * ref    - source; may be unaligned
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * Implemented with SSE2 intrinsics rather than MSVC __asm so the routine
 * also builds for x64 and with other compilers (this also removes the loop
 * label that had been copy-pasted from the 16-wide variant).
 */
static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */

    for (; height > 0; height -= 2)
    {
        __m128i* out0 = (__m128i*)dest;
        __m128i* out1 = (__m128i*)(dest + stride);

        const __m128i pred0 = _mm_loadl_epi64((const __m128i*)ref);
        const __m128i pred1 = _mm_loadl_epi64((const __m128i*)(ref + stride));

        _mm_storel_epi64(out0, _mm_avg_epu8(pred0, _mm_loadl_epi64(out0)));
        _mm_storel_epi64(out1, _mm_avg_epu8(pred1, _mm_loadl_epi64(out1)));

        ref += pair;
        dest += pair;
    }
}
00356 
00357 static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00358 {
00359         __asm
00360         {
00361                 mov edx, ref
00362                 mov ecx, dest
00363                 mov eax, stride
00364                 mov esi, height
00365                 lea edi, [eax+eax]
00366 
00367         MC_avg_x_16_sse2_loop:
00368 
00369                 movdqu xmm0, [edx]
00370                 movdqu xmm1, [edx+1]
00371                 movdqu xmm2, [edx+eax]
00372                 movdqu xmm3, [edx+eax+1]
00373                 pavgb xmm0, xmm1
00374                 pavgb xmm2, xmm3
00375                 movdqa xmm4, [ecx]
00376                 movdqa xmm5, [ecx+eax]
00377                 pavgb xmm0, xmm4
00378                 pavgb xmm2, xmm5
00379                 movdqa [ecx], xmm0
00380                 movdqa [ecx+eax], xmm2
00381                 add edx, edi
00382                 add ecx, edi
00383                 sub esi, 2
00384 
00385                 jg MC_avg_x_16_sse2_loop
00386         }
00387 }
00388 
/*
 * Horizontal half-pel prediction blended into dest, 8 bytes wide:
 *   p       = (ref[x] + ref[x + 1] + 1) >> 1
 *   dest[x] = (p + dest[x] + 1) >> 1
 *
 * dest   - destination, read and written with 64-bit accesses; no alignment
 *          required
 * ref    - source; may be unaligned, 9 bytes per row are read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * Implemented with SSE2 intrinsics rather than MSVC __asm so the routine
 * also builds for x64 and with other compilers.
 */
static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */

    for (; height > 0; height -= 2)
    {
        const uint8_t* next = ref + stride;
        __m128i* out0 = (__m128i*)dest;
        __m128i* out1 = (__m128i*)(dest + stride);

        const __m128i pred0 = _mm_avg_epu8(_mm_loadl_epi64((const __m128i*)ref),
                                           _mm_loadl_epi64((const __m128i*)(ref + 1)));
        const __m128i pred1 = _mm_avg_epu8(_mm_loadl_epi64((const __m128i*)next),
                                           _mm_loadl_epi64((const __m128i*)(next + 1)));

        _mm_storel_epi64(out0, _mm_avg_epu8(pred0, _mm_loadl_epi64(out0)));
        _mm_storel_epi64(out1, _mm_avg_epu8(pred1, _mm_loadl_epi64(out1)));

        ref += pair;
        dest += pair;
    }
}
00418 
00419 static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00420 {
00421         __asm
00422         {
00423                 mov edx, ref
00424                 mov ecx, dest
00425                 mov eax, stride
00426                 mov esi, height
00427                 lea edi, [eax+eax]
00428 
00429                 movdqu xmm0, [edx] 
00430 
00431         MC_avg_y_16_sse2_loop:
00432 
00433                 movdqu xmm1, [edx+eax] 
00434                 movdqu xmm2, [edx+edi] 
00435                 pavgb xmm0, xmm1 
00436                 pavgb xmm1, xmm2 
00437                 movdqa xmm3, [ecx] 
00438                 movdqa xmm4, [ecx+eax] 
00439                 pavgb xmm0, xmm3
00440                 pavgb xmm1, xmm4
00441                 movdqa [ecx], xmm0 
00442                 movdqa xmm0, xmm2 
00443                 movdqa [ecx+eax], xmm1 
00444                 add edx, edi 
00445                 add ecx, edi
00446                 sub esi, 2
00447 
00448                 jg MC_avg_y_16_sse2_loop
00449         }
00450 }
00451 
/*
 * Vertical half-pel prediction blended into dest, 8 bytes wide:
 *   p          = (ref[y][x] + ref[y + 1][x] + 1) >> 1
 *   dest[y][x] = (p + dest[y][x] + 1) >> 1
 *
 * dest   - destination, read and written with 64-bit accesses; no alignment
 *          required
 * ref    - source; may be unaligned; one row more than is written is read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * The earlier inline-asm version packed two rows into one XMM register and
 * carried the pair (row1,row2) into the next pass where (row2,row3) was
 * needed, so every row after the first two was predicted from the wrong
 * neighbour. Here each register holds a single row and only the last row
 * of a pass is carried over. SSE2 intrinsics are used instead of MSVC __asm
 * so the routine also builds for x64 and with other compilers (this also
 * removes the loop label that had been copy-pasted from the put variant).
 */
static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */
    __m128i upper = _mm_loadl_epi64((const __m128i*)ref);

    for (; height > 0; height -= 2)
    {
        __m128i* out0 = (__m128i*)dest;
        __m128i* out1 = (__m128i*)(dest + stride);

        const __m128i middle = _mm_loadl_epi64((const __m128i*)(ref + stride));
        const __m128i lower = _mm_loadl_epi64((const __m128i*)(ref + pair));

        const __m128i pred0 = _mm_avg_epu8(upper, middle);
        const __m128i pred1 = _mm_avg_epu8(middle, lower);

        _mm_storel_epi64(out0, _mm_avg_epu8(pred0, _mm_loadl_epi64(out0)));
        _mm_storel_epi64(out1, _mm_avg_epu8(pred1, _mm_loadl_epi64(out1)));

        upper = lower;   /* last row of this pass is the first of the next */
        ref += pair;
        dest += pair;
    }
}
00483 
00484 static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
00485 {
00486         __asm
00487         {
00488                 mov edx, ref 
00489                 mov ecx, dest
00490                 mov eax, stride 
00491                 mov esi, height 
00492                 lea edi, [eax+eax] 
00493                 
00494                 movdqa xmm7, [const_1_16_bytes] 
00495                 movdqu xmm0, [edx] 
00496                 movdqu xmm1, [edx+1] 
00497 
00498         MC_avg_xy_16_sse2_loop:
00499 
00500                 movdqu xmm2, [edx+eax] 
00501                 movdqu xmm3, [edx+eax+1] 
00502                 movdqu xmm4, [edx+edi] 
00503                 movdqu xmm5, [edx+edi+1] 
00504                 pavgb xmm0, xmm1 
00505                 pavgb xmm2, xmm3 
00506                 movdqa xmm1, xmm5 
00507                 pavgb xmm5, xmm4 
00508                 psubusb xmm2, xmm7 
00509                 pavgb xmm0, xmm2 
00510                 pavgb xmm2, xmm5
00511                 movdqa xmm5, [ecx]
00512                 movdqa xmm6, [ecx+eax]
00513                 pavgb xmm0, xmm5 
00514                 pavgb xmm2, xmm6
00515                 movdqa [ecx], xmm0
00516                 movdqa xmm0, xmm4
00517                 movdqa [ecx+eax], xmm2
00518                 add edx, edi
00519                 add ecx, edi
00520                 sub esi, 2
00521 
00522                 jg MC_avg_xy_16_sse2_loop
00523         }
00524 }
00525 
/*
 * Diagonal (horizontal + vertical) half-pel prediction blended into dest,
 * 8 bytes wide.
 *
 * Cascaded-average approximation of the four-tap mean. With
 * avg(a, b) = (a + b + 1) >> 1, every output row y is
 *   v0 = avg(ref[y][x],     ref[y + 1][x]) - 1      (saturating at 0)
 *   v1 = avg(ref[y][x + 1], ref[y + 1][x + 1])
 *   dest[y][x] = avg(avg(v0, v1), dest[y][x])
 *
 * dest   - destination, read and written with 64-bit accesses; no alignment
 *          required
 * ref    - source; may be unaligned; 9 bytes per row and one row more than
 *          is written are read
 * stride - byte distance between consecutive rows, used for both buffers
 * height - row count; rows are handled in pairs, so an odd count is rounded
 *          up to the next even one, and nothing is written when height <= 0
 *
 * The earlier inline-asm version packed two rows into one XMM register and
 * carried the pair (row1,row2) into the next pass where (row2,row3) was
 * needed, so every row after the first two was predicted from the wrong
 * rows. Here each register holds a single row and only the last row of a
 * pass is carried over. SSE2 intrinsics are used instead of MSVC __asm so
 * the routine also builds for x64 and with other compilers.
 */
static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int pair = stride + stride;   /* two rows per pass */
    const __m128i one = _mm_set1_epi8(1);

    /* first row of the pass: column x and column x + 1 */
    __m128i top = _mm_loadl_epi64((const __m128i*)ref);
    __m128i top_r = _mm_loadl_epi64((const __m128i*)(ref + 1));

    for (; height > 0; height -= 2)
    {
        __m128i* out0 = (__m128i*)dest;
        __m128i* out1 = (__m128i*)(dest + stride);

        const __m128i mid = _mm_loadl_epi64((const __m128i*)(ref + stride));
        const __m128i mid_r = _mm_loadl_epi64((const __m128i*)(ref + stride + 1));
        const __m128i bot = _mm_loadl_epi64((const __m128i*)(ref + pair));
        const __m128i bot_r = _mm_loadl_epi64((const __m128i*)(ref + pair + 1));

        const __m128i pred0 = _mm_avg_epu8(_mm_subs_epu8(_mm_avg_epu8(top, mid), one),
                                           _mm_avg_epu8(top_r, mid_r));
        const __m128i pred1 = _mm_avg_epu8(_mm_subs_epu8(_mm_avg_epu8(mid, bot), one),
                                           _mm_avg_epu8(mid_r, bot_r));

        _mm_storel_epi64(out0, _mm_avg_epu8(pred0, _mm_loadl_epi64(out0)));
        _mm_storel_epi64(out1, _mm_avg_epu8(pred1, _mm_loadl_epi64(out1)));

        /* bottom row becomes the top row of the next pass */
        top = bot;
        top_r = bot_r;
        ref += pair;
        dest += pair;
    }
}
00566 
00567 mpeg2_mc_t mpeg2_mc_sse2 = 
00568 {
00569         {MC_put_o_16_sse2, MC_put_x_16_sse2, MC_put_y_16_sse2, MC_put_xy_16_sse2,
00570         MC_put_o_8_sse2,  MC_put_x_8_sse2,  MC_put_y_8_sse2,  MC_put_xy_8_sse2},
00571         {MC_avg_o_16_sse2, MC_avg_x_16_sse2, MC_avg_y_16_sse2, MC_avg_xy_16_sse2,
00572         MC_avg_o_8_sse2,  MC_avg_x_8_sse2,  MC_avg_y_8_sse2,  MC_avg_xy_8_sse2}
00573 };

Generated on Tue Dec 13 14:47:50 2005 for guliverkli by  doxygen 1.4.5