/* With GCC >= 3.3, mark the constants as "used" so they are not discarded
 * even though they are only referenced from inline assembly. */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_U64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_U64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif
USED_U64(mmx_80w)     = 0x0080008000800080ULL; /* 128: chroma bias           */
USED_U64(mmx_10w)     = 0x1010101010101010ULL; /*  16: luma offset           */
USED_U64(mmx_00ffw)   = 0x00ff00ff00ff00ffULL; /* mask for the even Y bytes  */
USED_U64(mmx_Y_coeff) = 0x253f253f253f253fULL; /*  1.164 * 2^13              */

USED_U64(mmx_U_green) = 0xf37df37df37df37dULL; /* -0.391 * 2^13              */
USED_U64(mmx_U_blue)  = 0x4093409340934093ULL; /*  2.018 * 2^13              */
USED_U64(mmx_V_red)   = 0x3312331233123312ULL; /*  1.596 * 2^13              */
USED_U64(mmx_V_green) = 0xe5fce5fce5fce5fcULL; /* -0.813 * 2^13              */

USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL; /* keep the 5 MSBs of a byte  */
USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL; /* keep the 6 MSBs of a byte  */
#undef USED_U64
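
/*
 * For reference, the constants above implement the usual ITU-R BT.601
 * expansion from studio-swing YCbCr to full-range RGB:
 *
 *   R = 1.164 * (Y - 16)                      + 1.596 * (Cr - 128)
 *   G = 1.164 * (Y - 16) - 0.391 * (Cb - 128) - 0.813 * (Cr - 128)
 *   B = 1.164 * (Y - 16) + 2.018 * (Cb - 128)
 *
 * Each factor is stored as round(factor * 2^13): the operands are first
 * promoted by "psllw $3" and pmulhw keeps the high 16 bits of the product
 * (a divide by 2^16), so the net scale is 2^3 / 2^16 = 2^-13.
 */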

/* On x86-64 PIC builds the constants must be addressed RIP-relative. */
#if defined(__x86_64__) && defined(__PIC__)
# define G "(%%rip)"
#else
# define G
#endif

#define MMX_INIT_16 "                                                      \n\
movd      (%1), %%mm0      # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
movd      (%2), %%mm1      # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
pxor      %%mm4, %%mm4     # zero mm4                                      \n\
movq      (%0), %%mm6      # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
#movl      $0, (%3)         # cache preload for image                       \n\
"

#define INTRINSICS_INIT_16 \
    tmp64 = *(uint32_t *)p_u; \
    mm0 = (__m64)tmp64; \
    tmp64 = *(uint32_t *)p_v; \
    mm1 = (__m64)tmp64; \
    mm4 = (__m64)(uint64_t)0; \
    mm6 = (__m64)*(uint64_t *)p_y;
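
/*
 * The INTRINSICS_* variants mirror the inline-assembly macros using MMX
 * intrinsics (<mmintrin.h>); they assume the caller declares mm0..mm7
 * (__m64), tmp64 (uint64_t) and the p_y / p_u / p_v / p_buffer pointers,
 * matching the %0..%3 operands of the asm versions.
 */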

#define MMX_INIT_16_GRAY "                                                 \n\
movq      (%0), %%mm6      # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
#movl      $0, (%3)         # cache preload for image                       \n\
"

#define MMX_INIT_32 "                                                      \n\
movd      (%1), %%mm0      # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
movl      $0, (%3)         # cache preload for image                       \n\
movd      (%2), %%mm1      # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
pxor      %%mm4, %%mm4     # zero mm4                                      \n\
movq      (%0), %%mm6      # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
"

#define INTRINSICS_INIT_32 \
    tmp64 = *(uint32_t *)p_u; \
    mm0 = (__m64)tmp64; \
    *(uint16_t *)p_buffer = 0; \
    tmp64 = *(uint32_t *)p_v; \
    mm1 = (__m64)tmp64; \
    mm4 = (__m64)(uint64_t)0; \
    mm6 = (__m64)*(uint64_t *)p_y;
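
/*
 * Multiply stage (sketch of the resulting register contents):
 *   mm0 = (Cb - 128) *  2.018                          -> blue chroma term
 *   mm1 = (Cr - 128) *  1.596                          -> red chroma term
 *   mm2 = (Cb - 128) * -0.391 + (Cr - 128) * -0.813    -> green chroma term
 *   mm6 = (Y even - 16) * 1.164,  mm7 = (Y odd - 16) * 1.164
 */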
#define MMX_YUV_MUL "                                                      \n\
# convert the chroma part                                                  \n\
punpcklbw %%mm4, %%mm0          # scatter 4 Cb    00 u3 00 u2 00 u1 00 u0  \n\
punpcklbw %%mm4, %%mm1          # scatter 4 Cr    00 v3 00 v2 00 v1 00 v0  \n\
psubsw    mmx_80w"G", %%mm0     # Cb -= 128                                \n\
psubsw    mmx_80w"G", %%mm1     # Cr -= 128                                \n\
psllw     $3, %%mm0             # Promote precision                        \n\
psllw     $3, %%mm1             # Promote precision                        \n\
movq      %%mm0, %%mm2          # Copy 4 Cb       00 u3 00 u2 00 u1 00 u0  \n\
movq      %%mm1, %%mm3          # Copy 4 Cr       00 v3 00 v2 00 v1 00 v0  \n\
pmulhw    mmx_U_green"G", %%mm2 # Mul Cb with green coeff -> Cb green      \n\
pmulhw    mmx_V_green"G", %%mm3 # Mul Cr with green coeff -> Cr green      \n\
pmulhw    mmx_U_blue"G", %%mm0  # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0  \n\
pmulhw    mmx_V_red"G", %%mm1   # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0  \n\
paddsw    %%mm3, %%mm2          # Cb green + Cr green -> Cgreen            \n\
                                                                           \n\
# convert the luma part                                                    \n\
psubusb   mmx_10w"G", %%mm6     # Y -= 16                                  \n\
movq      %%mm6, %%mm7          # Copy 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0  \n\
pand      mmx_00ffw"G", %%mm6   # get Y even      00 Y6 00 Y4 00 Y2 00 Y0  \n\
psrlw     $8, %%mm7             # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1  \n\
psllw     $3, %%mm6             # Promote precision                        \n\
psllw     $3, %%mm7             # Promote precision                        \n\
pmulhw    mmx_Y_coeff"G", %%mm6 # Mul 4 Y even    00 y6 00 y4 00 y2 00 y0  \n\
pmulhw    mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd     00 y7 00 y5 00 y3 00 y1  \n\
"

#define INTRINSICS_YUV_MUL \
    mm0 = _mm_unpacklo_pi8(mm0, mm4); \
    mm1 = _mm_unpacklo_pi8(mm1, mm4); \
    mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
    mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
    mm0 = _mm_slli_pi16(mm0, 3); \
    mm1 = _mm_slli_pi16(mm1, 3); \
    mm2 = mm0; \
    mm3 = mm1; \
    mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
    mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
    mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
    mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
    mm2 = _mm_adds_pi16(mm2, mm3); \
    \
    mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
    mm7 = mm6; \
    mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
    mm7 = _mm_srli_pi16(mm7, 8); \
    mm6 = _mm_slli_pi16(mm6, 3); \
    mm7 = _mm_slli_pi16(mm7, 3); \
    mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
    mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
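
/*
 * Add stage (sketch): the luma term is added to each chroma term, the sums
 * are saturated to 0..255 and the even/odd pixels are re-interleaved,
 * leaving mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0.
 */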
#define MMX_YUV_ADD "                                                      \n\
# Add the luma part to each chroma part                                    \n\
movq      %%mm0, %%mm3          # Copy Cblue                               \n\
movq      %%mm1, %%mm4          # Copy Cred                                \n\
movq      %%mm2, %%mm5          # Copy Cgreen                              \n\
paddsw    %%mm6, %%mm0          # Y even + Cblue  00 B6 00 B4 00 B2 00 B0  \n\
paddsw    %%mm7, %%mm3          # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1  \n\
paddsw    %%mm6, %%mm1          # Y even + Cred   00 R6 00 R4 00 R2 00 R0  \n\
paddsw    %%mm7, %%mm4          # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1  \n\
paddsw    %%mm6, %%mm2          # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0  \n\
paddsw    %%mm7, %%mm5          # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1  \n\
                                                                           \n\
# Limit RGB even to 0..255                                                 \n\
packuswb  %%mm0, %%mm0          # B6 B4 B2 B0 / B6 B4 B2 B0                \n\
packuswb  %%mm1, %%mm1          # R6 R4 R2 R0 / R6 R4 R2 R0                \n\
packuswb  %%mm2, %%mm2          # G6 G4 G2 G0 / G6 G4 G2 G0                \n\
                                                                           \n\
# Limit RGB odd to 0..255                                                  \n\
packuswb  %%mm3, %%mm3          # B7 B5 B3 B1 / B7 B5 B3 B1                \n\
packuswb  %%mm4, %%mm4          # R7 R5 R3 R1 / R7 R5 R3 R1                \n\
packuswb  %%mm5, %%mm5          # G7 G5 G3 G1 / G7 G5 G3 G1                \n\
                                                                           \n\
# Interleave RGB even and odd                                              \n\
punpcklbw %%mm3, %%mm0          #                 B7 B6 B5 B4 B3 B2 B1 B0  \n\
punpcklbw %%mm4, %%mm1          #                 R7 R6 R5 R4 R3 R2 R1 R0  \n\
punpcklbw %%mm5, %%mm2          #                 G7 G6 G5 G4 G3 G2 G1 G0  \n\
"

#define INTRINSICS_YUV_ADD \
    mm3 = mm0; \
    mm4 = mm1; \
    mm5 = mm2; \
    mm0 = _mm_adds_pi16(mm0, mm6); \
    mm3 = _mm_adds_pi16(mm3, mm7); \
    mm1 = _mm_adds_pi16(mm1, mm6); \
    mm4 = _mm_adds_pi16(mm4, mm7); \
    mm2 = _mm_adds_pi16(mm2, mm6); \
    mm5 = _mm_adds_pi16(mm5, mm7); \
    \
    mm0 = _mm_packs_pu16(mm0, mm0); \
    mm1 = _mm_packs_pu16(mm1, mm1); \
    mm2 = _mm_packs_pu16(mm2, mm2); \
    \
    mm3 = _mm_packs_pu16(mm3, mm3); \
    mm4 = _mm_packs_pu16(mm4, mm4); \
    mm5 = _mm_packs_pu16(mm5, mm5); \
    \
    mm0 = _mm_unpacklo_pi8(mm0, mm3); \
    mm1 = _mm_unpacklo_pi8(mm1, mm4); \
    mm2 = _mm_unpacklo_pi8(mm2, mm5);
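
/*
 * Greyscale path (sketch): only the luma is converted; MMX_UNPACK_16_GRAY
 * then replicates each grey value into the R, G and B fields of an RGB16
 * pixel.
 */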
#define MMX_YUV_GRAY "                                                     \n\
# convert the luma part                                                    \n\
psubusb   mmx_10w"G", %%mm6     # Y -= 16                                  \n\
movq      %%mm6, %%mm7          # Copy 8 Y                                 \n\
pand      mmx_00ffw"G", %%mm6   # get Y even                               \n\
psrlw     $8, %%mm7             # get Y odd                                \n\
psllw     $3, %%mm6             # Promote precision                        \n\
psllw     $3, %%mm7             # Promote precision                        \n\
pmulhw    mmx_Y_coeff"G", %%mm6 # Mul 4 Y even                             \n\
pmulhw    mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd                              \n\
packuswb  %%mm6, %%mm6          # Limit to 0..255 and pack                 \n\
packuswb  %%mm7, %%mm7          # Limit to 0..255 and pack                 \n\
punpcklbw %%mm7, %%mm6          # Interleave even and odd Y                \n\
"

#define MMX_UNPACK_16_GRAY "                                               \n\
movq      %%mm6, %%mm5          # Copy 8 grey bytes                        \n\
pand      mmx_mask_f8"G", %%mm6 # keep 5 MSBs (red/blue field)             \n\
pand      mmx_mask_fc"G", %%mm5 # keep 6 MSBs (green field)                \n\
movq      %%mm6, %%mm7                                                     \n\
psrlw     $3, %%mm7             # blue field                               \n\
pxor      %%mm3, %%mm3          # zero mm3                                 \n\
movq      %%mm7, %%mm2                                                     \n\
movq      %%mm5, %%mm0                                                     \n\
punpcklbw %%mm3, %%mm5                                                     \n\
punpcklbw %%mm6, %%mm7          # red | blue for pixels 0-3                \n\
psllw     $3, %%mm5             # green field for pixels 0-3               \n\
por       %%mm5, %%mm7                                                     \n\
movq      %%mm7, (%3)           # store pixels 0-3                         \n\
punpckhbw %%mm3, %%mm0                                                     \n\
punpckhbw %%mm6, %%mm2          # red | blue for pixels 4-7                \n\
psllw     $3, %%mm0             # green field for pixels 4-7               \n\
movq      8(%0), %%mm6          # Load next 8 Y                            \n\
por       %%mm0, %%mm2                                                     \n\
movq      %%mm2, 8(%3)          # store pixels 4-7                         \n\
"
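
/*
 * RGB15 layout produced below (sketch): one 16-bit word per pixel,
 *   0 r7r6r5r4r3 g7g6g5g4g3 b7b6b5b4b3
 * with B in bits 0-4, G in bits 5-9, R in bits 10-14 and bit 15 clear.
 */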
#define MMX_UNPACK_15 "                                                    \n\
# mask unneeded bits off                                                   \n\
pand      mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______      \n\
psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3      \n\
pand      mmx_mask_f8"G", %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______      \n\
pand      mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______      \n\
psrlw     $1,%%mm1              # __r7r6r5 r4r3____ __r7r6r5 r4r3____      \n\
pxor      %%mm4, %%mm4          # zero mm4                                 \n\
movq      %%mm0, %%mm5          # Copy B7-B0                               \n\
movq      %%mm2, %%mm7          # Copy G7-G0                               \n\
                                                                           \n\
# convert rgb24 plane to rgb15 pack for pixel 0-3                          \n\
punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3______      \n\
punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3      \n\
psllw     $2,%%mm2              # ________ ____g7g6 g5g4g3__ ________      \n\
por       %%mm2, %%mm0          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3      \n\
movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0  \n\
movq      %%mm0, (%3)           # store pixel 0-3                          \n\
                                                                           \n\
# convert rgb24 plane to rgb15 pack for pixel 4-7                          \n\
punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3______      \n\
punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3      \n\
psllw     $2,%%mm7              # ________ ____g7g6 g5g4g3__ ________      \n\
movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0  \n\
por       %%mm7, %%mm5          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3      \n\
movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0  \n\
movq      %%mm5, 8(%3)          # store pixel 4-7                          \n\
"

#define INTRINSICS_UNPACK_15 \
    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
    mm0 = _mm_srli_pi16(mm0, 3); \
    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
    mm1 = _mm_srli_pi16(mm1, 1); \
    mm4 = (__m64)(uint64_t)0; \
    mm5 = mm0; \
    mm7 = mm2; \
    \
    mm2 = _mm_unpacklo_pi8(mm2, mm4); \
    mm0 = _mm_unpacklo_pi8(mm0, mm1); \
    mm2 = _mm_slli_pi16(mm2, 2); \
    mm0 = _mm_or_si64(mm0, mm2); \
    tmp64 = *(uint64_t *)(p_y + 8); \
    mm6 = (__m64)tmp64; \
    *(uint64_t *)p_buffer = (uint64_t)mm0; \
    \
    mm7 = _mm_unpackhi_pi8(mm7, mm4); \
    mm5 = _mm_unpackhi_pi8(mm5, mm1); \
    mm7 = _mm_slli_pi16(mm7, 2); \
    tmp64 = (uint64_t)*(uint32_t *)(p_u + 4); \
    mm0 = (__m64)tmp64; \
    mm5 = _mm_or_si64(mm5, mm7); \
    tmp64 = (uint64_t)*(uint32_t *)(p_v + 4); \
    mm1 = (__m64)tmp64; \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
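
/*
 * RGB16 layout produced below (sketch): one 16-bit word per pixel,
 *   r7r6r5r4r3 g7g6g5g4g3g2 b7b6b5b4b3
 * i.e. 5-6-5 with B in bits 0-4, G in bits 5-10, R in bits 11-15.
 */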
#define MMX_UNPACK_16 "                                                    \n\
# mask unneeded bits off                                                   \n\
pand      mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______      \n\
pand      mmx_mask_fc"G", %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____      \n\
pand      mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______      \n\
psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3      \n\
pxor      %%mm4, %%mm4          # zero mm4                                 \n\
movq      %%mm0, %%mm5          # Copy B7-B0                               \n\
movq      %%mm2, %%mm7          # Copy G7-G0                               \n\
                                                                           \n\
# convert rgb24 plane to rgb16 pack for pixel 0-3                          \n\
punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3g2____      \n\
punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3      \n\
psllw     $3,%%mm2              # ________ __g7g6g5 g4g3g2__ ________      \n\
por       %%mm2, %%mm0          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3      \n\
movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0  \n\
movq      %%mm0, (%3)           # store pixel 0-3                          \n\
                                                                           \n\
# convert rgb24 plane to rgb16 pack for pixel 4-7                          \n\
punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3g2____      \n\
punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3      \n\
psllw     $3,%%mm7              # ________ __g7g6g5 g4g3g2__ ________      \n\
movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0  \n\
por       %%mm7, %%mm5          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3      \n\
movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0  \n\
movq      %%mm5, 8(%3)          # store pixel 4-7                          \n\
"

#define INTRINSICS_UNPACK_16 \
    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
    mm0 = _mm_srli_pi16(mm0, 3); \
    mm4 = (__m64)(uint64_t)0; \
    mm5 = mm0; \
    mm7 = mm2; \
    \
    mm2 = _mm_unpacklo_pi8(mm2, mm4); \
    mm0 = _mm_unpacklo_pi8(mm0, mm1); \
    mm2 = _mm_slli_pi16(mm2, 3); \
    mm0 = _mm_or_si64(mm0, mm2); \
    tmp64 = *(uint64_t *)(p_y + 8); \
    mm6 = (__m64)tmp64; \
    *(uint64_t *)p_buffer = (uint64_t)mm0; \
    \
    mm7 = _mm_unpackhi_pi8(mm7, mm4); \
    mm5 = _mm_unpackhi_pi8(mm5, mm1); \
    mm7 = _mm_slli_pi16(mm7, 3); \
    tmp64 = (uint64_t)*(uint32_t *)(p_u + 4); \
    mm0 = (__m64)tmp64; \
    mm5 = _mm_or_si64(mm5, mm7); \
    tmp64 = (uint64_t)*(uint32_t *)(p_v + 4); \
    mm1 = (__m64)tmp64; \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
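
/*
 * RGB32 layout produced below (sketch): one 32-bit word per pixel, stored in
 * memory as B, G, R, 0 (0x00RRGGBB when read as a little-endian uint32_t).
 */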
#define MMX_UNPACK_32 "                                                    \n\
pxor      %%mm3, %%mm3          # zero mm3                                 \n\
movq      %%mm0, %%mm6          #                 B7 B6 B5 B4 B3 B2 B1 B0  \n\
movq      %%mm1, %%mm7          #                 R7 R6 R5 R4 R3 R2 R1 R0  \n\
movq      %%mm0, %%mm4          #                 B7 B6 B5 B4 B3 B2 B1 B0  \n\
movq      %%mm1, %%mm5          #                 R7 R6 R5 R4 R3 R2 R1 R0  \n\
punpcklbw %%mm2, %%mm6          #                 G3 B3 G2 B2 G1 B1 G0 B0  \n\
punpcklbw %%mm3, %%mm7          #                 00 R3 00 R2 00 R1 00 R0  \n\
punpcklwd %%mm7, %%mm6          #                 00 R1 G1 B1 00 R0 G0 B0  \n\
movq      %%mm6, (%3)           # Store ARGB1 ARGB0                        \n\
movq      %%mm0, %%mm6          #                 B7 B6 B5 B4 B3 B2 B1 B0  \n\
punpcklbw %%mm2, %%mm6          #                 G3 B3 G2 B2 G1 B1 G0 B0  \n\
punpckhwd %%mm7, %%mm6          #                 00 R3 G3 B3 00 R2 G2 B2  \n\
movq      %%mm6, 8(%3)          # Store ARGB3 ARGB2                        \n\
punpckhbw %%mm2, %%mm4          #                 G7 B7 G6 B6 G5 B5 G4 B4  \n\
punpckhbw %%mm3, %%mm5          #                 00 R7 00 R6 00 R5 00 R4  \n\
punpcklwd %%mm5, %%mm4          #                 00 R5 G5 B5 00 R4 G4 B4  \n\
movq      %%mm4, 16(%3)         # Store ARGB5 ARGB4                        \n\
movq      %%mm0, %%mm4          #                 B7 B6 B5 B4 B3 B2 B1 B0  \n\
punpckhbw %%mm2, %%mm4          #                 G7 B7 G6 B6 G5 B5 G4 B4  \n\
punpckhwd %%mm5, %%mm4          #                 00 R7 G7 B7 00 R6 G6 B6  \n\
movq      %%mm4, 24(%3)         # Store ARGB7 ARGB6                        \n\
                                                                           \n\
#movd      4(%1), %%mm0          # Load 4 Cb       00 00 00 00 u3 u2 u1 u0  \n\
#movd      4(%2), %%mm1          # Load 4 Cr       00 00 00 00 v3 v2 v1 v0  \n\
#pxor      %%mm4, %%mm4          # zero mm4                                 \n\
#movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0  \n\
"

#define INTRINSICS_UNPACK_32 \
    mm3 = (__m64)(uint64_t)0; \
    mm6 = mm0; \
    mm7 = mm1; \
    mm4 = mm0; \
    mm5 = mm1; \
    mm6 = _mm_unpacklo_pi8(mm6, mm2); \
    mm7 = _mm_unpacklo_pi8(mm7, mm3); \
    mm6 = _mm_unpacklo_pi16(mm6, mm7); \
    *(uint64_t *)p_buffer = (uint64_t)mm6; \
    mm6 = mm0; \
    mm6 = _mm_unpacklo_pi8(mm6, mm2); \
    mm6 = _mm_unpackhi_pi16(mm6, mm7); \
    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \
    mm4 = _mm_unpackhi_pi8(mm4, mm2); \
    mm5 = _mm_unpackhi_pi8(mm5, mm3); \
    mm4 = _mm_unpacklo_pi16(mm4, mm5); \
    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm4; \
    mm4 = mm0; \
    mm4 = _mm_unpackhi_pi8(mm4, mm2); \
    mm4 = _mm_unpackhi_pi16(mm4, mm5); \
    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm4;
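
/*
 * Usage sketch (illustration only, not part of the converter): with
 * <mmintrin.h> and <stdint.h> available, the intrinsics macros chain as
 * follows to convert one row of I420 to packed RGB32, 8 pixels per pass.
 * i_width is a hypothetical loop bound; mm0..mm7, tmp64 and the p_y, p_u,
 * p_v, p_buffer (uint32_t *) names must be used as-is because the macros
 * reference them directly.
 *
 *     __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
 *     uint64_t tmp64;
 *     int i_x;
 *
 *     for( i_x = 0; i_x < i_width; i_x += 8 )
 *     {
 *         INTRINSICS_INIT_32
 *         INTRINSICS_YUV_MUL
 *         INTRINSICS_YUV_ADD
 *         INTRINSICS_UNPACK_32
 *         p_y += 8; p_u += 4; p_v += 4;    two luma samples per chroma sample
 *         p_buffer += 8;                   8 x uint32_t written per pass
 *     }
 *     _mm_empty();                         leave MMX state before FPU code
 */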