clang API Documentation

xmmintrin.h
00001 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
00002  *
00003  * Permission is hereby granted, free of charge, to any person obtaining a copy
00004  * of this software and associated documentation files (the "Software"), to deal
00005  * in the Software without restriction, including without limitation the rights
00006  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007  * copies of the Software, and to permit persons to whom the Software is
00008  * furnished to do so, subject to the following conditions:
00009  *
00010  * The above copyright notice and this permission notice shall be included in
00011  * all copies or substantial portions of the Software.
00012  *
00013  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019  * THE SOFTWARE.
00020  *
00021  *===-----------------------------------------------------------------------===
00022  */
00023  
00024 #ifndef __XMMINTRIN_H
00025 #define __XMMINTRIN_H
00026  
00027 #ifndef __SSE__
00028 #error "SSE instruction set not enabled"
00029 #else
00030 
00031 #include <mmintrin.h>
00032 
00033 typedef int __v4si __attribute__((__vector_size__(16)));
00034 typedef float __v4sf __attribute__((__vector_size__(16)));
00035 typedef float __m128 __attribute__((__vector_size__(16)));
00036 
00037 /* This header should only be included in a hosted environment as it depends on
00038  * a standard library to provide allocation routines. */
00039 #if __STDC_HOSTED__
00040 #include <mm_malloc.h>
00041 #endif
00042 
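/* Editor's example (not part of the upstream header): clang implements __m128
 * as a true vector type, so the intrinsics below freely use ordinary operators
 * such as +, * and [] on it. A minimal sketch; the helper name is hypothetical. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__example_lane0_sum(__m128 __a, __m128 __b)
{
  /* Lane-wise addition and element access come from the vector_size extension. */
  __m128 __s = __a + __b;
  return __s[0];
}
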
00043 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00044 _mm_add_ss(__m128 __a, __m128 __b)
00045 {
00046   __a[0] += __b[0];
00047   return __a;
00048 }
00049 
00050 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00051 _mm_add_ps(__m128 __a, __m128 __b)
00052 {
00053   return __a + __b;
00054 }
00055 
00056 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00057 _mm_sub_ss(__m128 __a, __m128 __b)
00058 {
00059   __a[0] -= __b[0];
00060   return __a;
00061 }
00062 
00063 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00064 _mm_sub_ps(__m128 __a, __m128 __b)
00065 {
00066   return __a - __b;
00067 }
00068 
00069 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00070 _mm_mul_ss(__m128 __a, __m128 __b)
00071 {
00072   __a[0] *= __b[0];
00073   return __a;
00074 }
00075 
00076 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00077 _mm_mul_ps(__m128 __a, __m128 __b)
00078 {
00079   return __a * __b;
00080 }
00081 
00082 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00083 _mm_div_ss(__m128 __a, __m128 __b)
00084 {
00085   __a[0] /= __b[0];
00086   return __a;
00087 }
00088 
00089 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00090 _mm_div_ps(__m128 __a, __m128 __b)
00091 {
00092   return __a / __b;
00093 }
00094 
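/* Editor's example (not part of the upstream header): a minimal sketch showing
 * how the packed arithmetic intrinsics above compose; the helper name
 * __example_fma_ps is hypothetical (SSE has no fused multiply-add). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_fma_ps(__m128 __x, __m128 __y, __m128 __z)
{
  /* All four lanes: x * y + z, computed as two separately rounded operations. */
  return _mm_add_ps(_mm_mul_ps(__x, __y), __z);
}
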
00095 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00096 _mm_sqrt_ss(__m128 __a)
00097 {
00098   __m128 __c = __builtin_ia32_sqrtss(__a);
00099   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
00100 }
00101 
00102 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00103 _mm_sqrt_ps(__m128 __a)
00104 {
00105   return __builtin_ia32_sqrtps(__a);
00106 }
00107 
00108 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00109 _mm_rcp_ss(__m128 __a)
00110 {
00111   __m128 __c = __builtin_ia32_rcpss(__a);
00112   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
00113 }
00114 
00115 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00116 _mm_rcp_ps(__m128 __a)
00117 {
00118   return __builtin_ia32_rcpps(__a);
00119 }
00120 
00121 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00122 _mm_rsqrt_ss(__m128 __a)
00123 {
00124   __m128 __c = __builtin_ia32_rsqrtss(__a);
00125   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
00126 }
00127 
00128 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00129 _mm_rsqrt_ps(__m128 __a)
00130 {
00131   return __builtin_ia32_rsqrtps(__a);
00132 }
00133 
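/* Editor's example (not part of the upstream header): _mm_rsqrt_ps gives only
 * an approximation accurate to roughly 12 bits, so it is commonly refined with
 * one Newton-Raphson step. A sketch with a hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_rsqrt_nr_ps(__m128 __x)
{
  __m128 __e = _mm_rsqrt_ps(__x);                          /* initial estimate */
  __m128 __hx = _mm_mul_ps((__m128){ 0.5f, 0.5f, 0.5f, 0.5f }, __x);
  /* e' = e * (1.5 - 0.5*x*e*e) roughly doubles the number of correct bits. */
  return _mm_mul_ps(__e, _mm_sub_ps((__m128){ 1.5f, 1.5f, 1.5f, 1.5f },
                                    _mm_mul_ps(__hx, _mm_mul_ps(__e, __e))));
}
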
00134 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00135 _mm_min_ss(__m128 __a, __m128 __b)
00136 {
00137   return __builtin_ia32_minss(__a, __b);
00138 }
00139 
00140 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00141 _mm_min_ps(__m128 __a, __m128 __b)
00142 {
00143   return __builtin_ia32_minps(__a, __b);
00144 }
00145 
00146 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00147 _mm_max_ss(__m128 __a, __m128 __b)
00148 {
00149   return __builtin_ia32_maxss(__a, __b);
00150 }
00151 
00152 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00153 _mm_max_ps(__m128 __a, __m128 __b)
00154 {
00155   return __builtin_ia32_maxps(__a, __b);
00156 }
00157 
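/* Editor's example (not part of the upstream header): the min/max intrinsics
 * above compose into a branchless per-lane clamp. Hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_clamp_ps(__m128 __x, __m128 __lo, __m128 __hi)
{
  /* Clamp every lane of __x into the range [__lo, __hi]. */
  return _mm_min_ps(_mm_max_ps(__x, __lo), __hi);
}
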
00158 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00159 _mm_and_ps(__m128 __a, __m128 __b)
00160 {
00161   return (__m128)((__v4si)__a & (__v4si)__b);
00162 }
00163 
00164 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00165 _mm_andnot_ps(__m128 __a, __m128 __b)
00166 {
00167   return (__m128)(~(__v4si)__a & (__v4si)__b);
00168 }
00169 
00170 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00171 _mm_or_ps(__m128 __a, __m128 __b)
00172 {
00173   return (__m128)((__v4si)__a | (__v4si)__b);
00174 }
00175 
00176 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00177 _mm_xor_ps(__m128 __a, __m128 __b)
00178 {
00179   return (__m128)((__v4si)__a ^ (__v4si)__b);
00180 }
00181 
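/* Editor's example (not part of the upstream header): the classic branchless
 * select built from the bitwise intrinsics above; note that _mm_andnot_ps
 * complements its first operand. Hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_select_ps(__m128 __mask, __m128 __a, __m128 __b)
{
  /* Per lane: mask ? a : b, where each mask lane is all-ones or all-zeros. */
  return _mm_or_ps(_mm_and_ps(__mask, __a), _mm_andnot_ps(__mask, __b));
}
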
00182 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00183 _mm_cmpeq_ss(__m128 __a, __m128 __b)
00184 {
00185   return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
00186 }
00187 
00188 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00189 _mm_cmpeq_ps(__m128 __a, __m128 __b)
00190 {
00191   return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
00192 }
00193 
00194 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00195 _mm_cmplt_ss(__m128 __a, __m128 __b)
00196 {
00197   return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
00198 }
00199 
00200 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00201 _mm_cmplt_ps(__m128 __a, __m128 __b)
00202 {
00203   return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
00204 }
00205 
00206 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00207 _mm_cmple_ss(__m128 __a, __m128 __b)
00208 {
00209   return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
00210 }
00211 
00212 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00213 _mm_cmple_ps(__m128 __a, __m128 __b)
00214 {
00215   return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
00216 }
00217 
00218 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00219 _mm_cmpgt_ss(__m128 __a, __m128 __b)
00220 {
00221   return (__m128)__builtin_shufflevector(__a,
00222                                          __builtin_ia32_cmpss(__b, __a, 1),
00223                                          4, 1, 2, 3);
00224 }
00225 
00226 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00227 _mm_cmpgt_ps(__m128 __a, __m128 __b)
00228 {
00229   return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
00230 }
00231 
00232 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00233 _mm_cmpge_ss(__m128 __a, __m128 __b)
00234 {
00235   return (__m128)__builtin_shufflevector(__a,
00236                                          __builtin_ia32_cmpss(__b, __a, 2),
00237                                          4, 1, 2, 3);
00238 }
00239 
00240 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00241 _mm_cmpge_ps(__m128 __a, __m128 __b)
00242 {
00243   return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
00244 }
00245 
00246 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00247 _mm_cmpneq_ss(__m128 __a, __m128 __b)
00248 {
00249   return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
00250 }
00251 
00252 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00253 _mm_cmpneq_ps(__m128 __a, __m128 __b)
00254 {
00255   return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
00256 }
00257 
00258 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00259 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
00260 {
00261   return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
00262 }
00263 
00264 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00265 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
00266 {
00267   return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
00268 }
00269 
00270 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00271 _mm_cmpnle_ss(__m128 __a, __m128 __b)
00272 {
00273   return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
00274 }
00275 
00276 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00277 _mm_cmpnle_ps(__m128 __a, __m128 __b)
00278 {
00279   return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
00280 }
00281 
00282 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00283 _mm_cmpngt_ss(__m128 __a, __m128 __b)
00284 {
00285   return (__m128)__builtin_shufflevector(__a,
00286                                          __builtin_ia32_cmpss(__b, __a, 5),
00287                                          4, 1, 2, 3);
00288 }
00289 
00290 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00291 _mm_cmpngt_ps(__m128 __a, __m128 __b)
00292 {
00293   return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
00294 }
00295 
00296 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00297 _mm_cmpnge_ss(__m128 __a, __m128 __b)
00298 {
00299   return (__m128)__builtin_shufflevector(__a,
00300                                          __builtin_ia32_cmpss(__b, __a, 6),
00301                                          4, 1, 2, 3);
00302 }
00303 
00304 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00305 _mm_cmpnge_ps(__m128 __a, __m128 __b)
00306 {
00307   return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
00308 }
00309 
00310 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00311 _mm_cmpord_ss(__m128 __a, __m128 __b)
00312 {
00313   return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
00314 }
00315 
00316 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00317 _mm_cmpord_ps(__m128 __a, __m128 __b)
00318 {
00319   return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
00320 }
00321 
00322 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00323 _mm_cmpunord_ss(__m128 __a, __m128 __b)
00324 {
00325   return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
00326 }
00327 
00328 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00329 _mm_cmpunord_ps(__m128 __a, __m128 __b)
00330 {
00331   return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
00332 }
00333 
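/* Editor's example (not part of the upstream header): the comparison intrinsics
 * above return all-ones/all-zeros lane masks rather than booleans, so they are
 * normally combined with the bitwise operations. Hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_zero_negative_ps(__m128 __x)
{
  /* Keep lanes where x >= 0 and force negative lanes to +0.0. */
  __m128 __neg = _mm_cmplt_ps(__x, (__m128){ 0, 0, 0, 0 });
  return _mm_andnot_ps(__neg, __x);
}
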
00334 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00335 _mm_comieq_ss(__m128 __a, __m128 __b)
00336 {
00337   return __builtin_ia32_comieq(__a, __b);
00338 }
00339 
00340 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00341 _mm_comilt_ss(__m128 __a, __m128 __b)
00342 {
00343   return __builtin_ia32_comilt(__a, __b);
00344 }
00345 
00346 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00347 _mm_comile_ss(__m128 __a, __m128 __b)
00348 {
00349   return __builtin_ia32_comile(__a, __b);
00350 }
00351 
00352 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00353 _mm_comigt_ss(__m128 __a, __m128 __b)
00354 {
00355   return __builtin_ia32_comigt(__a, __b);
00356 }
00357 
00358 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00359 _mm_comige_ss(__m128 __a, __m128 __b)
00360 {
00361   return __builtin_ia32_comige(__a, __b);
00362 }
00363 
00364 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00365 _mm_comineq_ss(__m128 __a, __m128 __b)
00366 {
00367   return __builtin_ia32_comineq(__a, __b);
00368 }
00369 
00370 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00371 _mm_ucomieq_ss(__m128 __a, __m128 __b)
00372 {
00373   return __builtin_ia32_ucomieq(__a, __b);
00374 }
00375 
00376 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00377 _mm_ucomilt_ss(__m128 __a, __m128 __b)
00378 {
00379   return __builtin_ia32_ucomilt(__a, __b);
00380 }
00381 
00382 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00383 _mm_ucomile_ss(__m128 __a, __m128 __b)
00384 {
00385   return __builtin_ia32_ucomile(__a, __b);
00386 }
00387 
00388 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00389 _mm_ucomigt_ss(__m128 __a, __m128 __b)
00390 {
00391   return __builtin_ia32_ucomigt(__a, __b);
00392 }
00393 
00394 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00395 _mm_ucomige_ss(__m128 __a, __m128 __b)
00396 {
00397   return __builtin_ia32_ucomige(__a, __b);
00398 }
00399 
00400 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00401 _mm_ucomineq_ss(__m128 __a, __m128 __b)
00402 {
00403   return __builtin_ia32_ucomineq(__a, __b);
00404 }
00405 
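/* Editor's example (not part of the upstream header): unlike the _mm_cmp*_ss
 * family, the comi/ucomi intrinsics above return a plain int, so they can drive
 * ordinary control flow directly. Hypothetical helper name. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__example_min_low_lane(__m128 __a, __m128 __b)
{
  /* Return the smaller of the two low lanes as a scalar float. */
  return _mm_comilt_ss(__a, __b) ? __a[0] : __b[0];
}
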
00406 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00407 _mm_cvtss_si32(__m128 __a)
00408 {
00409   return __builtin_ia32_cvtss2si(__a);
00410 }
00411 
00412 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00413 _mm_cvt_ss2si(__m128 __a)
00414 {
00415   return _mm_cvtss_si32(__a);
00416 }
00417 
00418 #ifdef __x86_64__
00419 
00420 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
00421 _mm_cvtss_si64(__m128 __a)
00422 {
00423   return __builtin_ia32_cvtss2si64(__a);
00424 }
00425 
00426 #endif
00427 
00428 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00429 _mm_cvtps_pi32(__m128 __a)
00430 {
00431   return (__m64)__builtin_ia32_cvtps2pi(__a);
00432 }
00433 
00434 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00435 _mm_cvt_ps2pi(__m128 __a)
00436 {
00437   return _mm_cvtps_pi32(__a);
00438 }
00439 
00440 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00441 _mm_cvttss_si32(__m128 __a)
00442 {
00443   return __a[0];
00444 }
00445 
00446 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00447 _mm_cvtt_ss2si(__m128 __a)
00448 {
00449   return _mm_cvttss_si32(__a);
00450 }
00451 
00452 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
00453 _mm_cvttss_si64(__m128 __a)
00454 {
00455   return __a[0];
00456 }
00457 
00458 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00459 _mm_cvttps_pi32(__m128 __a)
00460 {
00461   return (__m64)__builtin_ia32_cvttps2pi(__a);
00462 }
00463 
00464 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00465 _mm_cvtt_ps2pi(__m128 __a)
00466 {
00467   return _mm_cvttps_pi32(__a);
00468 }
00469 
00470 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00471 _mm_cvtsi32_ss(__m128 __a, int __b)
00472 {
00473   __a[0] = __b;
00474   return __a;
00475 }
00476 
00477 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00478 _mm_cvt_si2ss(__m128 __a, int __b)
00479 {
00480   return _mm_cvtsi32_ss(__a, __b);
00481 }
00482 
00483 #ifdef __x86_64__
00484 
00485 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00486 _mm_cvtsi64_ss(__m128 __a, long long __b)
00487 {
00488   __a[0] = __b;
00489   return __a;
00490 }
00491 
00492 #endif
00493 
00494 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00495 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
00496 {
00497   return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
00498 }
00499 
00500 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00501 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
00502 {
00503   return _mm_cvtpi32_ps(__a, __b);
00504 }
00505 
00506 static __inline__ float __attribute__((__always_inline__, __nodebug__))
00507 _mm_cvtss_f32(__m128 __a)
00508 {
00509   return __a[0];
00510 }
00511 
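/* Editor's example (not part of the upstream header): _mm_cvtss_si32 rounds
 * according to MXCSR (round-to-nearest by default) while _mm_cvttss_si32
 * truncates toward zero like a C cast. A sketch with a hypothetical name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_round_low_lane(float __f)
{
  __m128 __v = (__m128){ __f, 0, 0, 0 };
  /* For __f = 2.7f this returns 3, whereas _mm_cvttss_si32(__v) returns 2. */
  return _mm_cvtss_si32(__v);
}
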
00512 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00513 _mm_loadh_pi(__m128 __a, const __m64 *__p)
00514 {
00515   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
00516   struct __mm_loadh_pi_struct {
00517     __mm_loadh_pi_v2f32 __u;
00518   } __attribute__((__packed__, __may_alias__));
00519   __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
00520   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
00521   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
00522 }
00523 
00524 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00525 _mm_loadl_pi(__m128 __a, const __m64 *__p)
00526 {
00527   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
00528   struct __mm_loadl_pi_struct {
00529     __mm_loadl_pi_v2f32 __u;
00530   } __attribute__((__packed__, __may_alias__));
00531   __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
00532   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
00533   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
00534 }
00535 
00536 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00537 _mm_load_ss(const float *__p)
00538 {
00539   struct __mm_load_ss_struct {
00540     float __u;
00541   } __attribute__((__packed__, __may_alias__));
00542   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
00543   return (__m128){ __u, 0, 0, 0 };
00544 }
00545 
00546 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00547 _mm_load1_ps(const float *__p)
00548 {
00549   struct __mm_load1_ps_struct {
00550     float __u;
00551   } __attribute__((__packed__, __may_alias__));
00552   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
00553   return (__m128){ __u, __u, __u, __u };
00554 }
00555 
00556 #define _mm_load_ps1(p) _mm_load1_ps(p)
00557 
00558 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00559 _mm_load_ps(const float *__p)
00560 {
00561   return *(__m128*)__p;
00562 }
00563 
00564 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00565 _mm_loadu_ps(const float *__p)
00566 {
00567   struct __loadu_ps {
00568     __m128 __v;
00569   } __attribute__((__packed__, __may_alias__));
00570   return ((struct __loadu_ps*)__p)->__v;
00571 }
00572 
00573 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00574 _mm_loadr_ps(const float *__p)
00575 {
00576   __m128 __a = _mm_load_ps(__p);
00577   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
00578 }
00579 
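/* Editor's example (not part of the upstream header): _mm_load_ps requires a
 * 16-byte-aligned pointer, while _mm_loadu_ps accepts any alignment at some
 * cost. A sketch; the helper name is hypothetical. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_load_any(const float *__p)
{
  /* Use the unaligned form because nothing is known about __p's alignment. */
  return _mm_loadu_ps(__p);
}
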
00580 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00581 _mm_set_ss(float __w)
00582 {
00583   return (__m128){ __w, 0, 0, 0 };
00584 }
00585 
00586 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00587 _mm_set1_ps(float __w)
00588 {
00589   return (__m128){ __w, __w, __w, __w };
00590 }
00591 
00592 /* Microsoft specific. */
00593 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00594 _mm_set_ps1(float __w)
00595 {
00596     return _mm_set1_ps(__w);
00597 }
00598 
00599 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00600 _mm_set_ps(float __z, float __y, float __x, float __w)
00601 {
00602   return (__m128){ __w, __x, __y, __z };
00603 }
00604 
00605 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00606 _mm_setr_ps(float __z, float __y, float __x, float __w)
00607 {
00608   return (__m128){ __z, __y, __x, __w };
00609 }
00610 
00611 static __inline__ __m128 __attribute__((__always_inline__))
00612 _mm_setzero_ps(void)
00613 {
00614   return (__m128){ 0, 0, 0, 0 };
00615 }
00616 
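/* Editor's example (not part of the upstream header): _mm_set_ps lists lanes
 * from highest to lowest while _mm_setr_ps lists them from lowest to highest,
 * so both calls below build the vector { 1, 2, 3, 4 } in memory order.
 * Hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_set_order(void)
{
  __m128 __hi_first = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 __lo_first = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  /* Element 0 of both vectors is 1.0f; the sum is { 2, 4, 6, 8 }. */
  return _mm_add_ps(__hi_first, __lo_first);
}
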
00617 static __inline__ void __attribute__((__always_inline__))
00618 _mm_storeh_pi(__m64 *__p, __m128 __a)
00619 {
00620   __builtin_ia32_storehps((__v2si *)__p, __a);
00621 }
00622 
00623 static __inline__ void __attribute__((__always_inline__))
00624 _mm_storel_pi(__m64 *__p, __m128 __a)
00625 {
00626   __builtin_ia32_storelps((__v2si *)__p, __a);
00627 }
00628 
00629 static __inline__ void __attribute__((__always_inline__))
00630 _mm_store_ss(float *__p, __m128 __a)
00631 {
00632   struct __mm_store_ss_struct {
00633     float __u;
00634   } __attribute__((__packed__, __may_alias__));
00635   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
00636 }
00637 
00638 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00639 _mm_storeu_ps(float *__p, __m128 __a)
00640 {
00641   __builtin_ia32_storeups(__p, __a);
00642 }
00643 
00644 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00645 _mm_store1_ps(float *__p, __m128 __a)
00646 {
00647   __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
00648   _mm_storeu_ps(__p, __a);
00649 }
00650 
00651 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00652 _mm_store_ps1(float *__p, __m128 __a)
00653 {
00654     return _mm_store1_ps(__p, __a);
00655 }
00656 
00657 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00658 _mm_store_ps(float *__p, __m128 __a)
00659 {
00660   *(__m128 *)__p = __a;
00661 }
00662 
00663 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00664 _mm_storer_ps(float *__p, __m128 __a)
00665 {
00666   __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
00667   _mm_store_ps(__p, __a);
00668 }
00669 
00670 #define _MM_HINT_T0 3
00671 #define _MM_HINT_T1 2
00672 #define _MM_HINT_T2 1
00673 #define _MM_HINT_NTA 0
00674 
00675 #ifndef _MSC_VER
00676 /* FIXME: We have to #define this because "sel" must be a constant integer, and
00677    Sema doesn't do any form of constant propagation yet. */
00678 
00679 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
00680 #endif
00681 
00682 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00683 _mm_stream_pi(__m64 *__p, __m64 __a)
00684 {
00685   __builtin_ia32_movntq(__p, __a);
00686 }
00687 
00688 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00689 _mm_stream_ps(float *__p, __m128 __a)
00690 {
00691   __builtin_ia32_movntps(__p, __a);
00692 }
00693 
00694 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00695 _mm_sfence(void)
00696 {
00697   __builtin_ia32_sfence();
00698 }
00699 
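/* Editor's example (not part of the upstream header): non-temporal stores bypass
 * the cache, and a store fence makes them globally visible before later ordinary
 * stores. A sketch with hypothetical names; __n is assumed to be a multiple of 4
 * and __dst to be 16-byte aligned, as _mm_stream_ps requires. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
__example_stream_copy(float *__dst, const float *__src, int __n)
{
  int __i;
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_ps(__dst + __i, _mm_loadu_ps(__src + __i));
  _mm_sfence();
}
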
00700 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00701 _mm_extract_pi16(__m64 __a, int __n)
00702 {
00703   __v4hi __b = (__v4hi)__a;
00704   return (unsigned short)__b[__n & 3];
00705 }
00706 
00707 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00708 _mm_insert_pi16(__m64 __a, int __d, int __n)
00709 {
00710    __v4hi __b = (__v4hi)__a;
00711    __b[__n & 3] = __d;
00712    return (__m64)__b;
00713 }
00714 
00715 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00716 _mm_max_pi16(__m64 __a, __m64 __b)
00717 {
00718   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
00719 }
00720 
00721 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00722 _mm_max_pu8(__m64 __a, __m64 __b)
00723 {
00724   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
00725 }
00726 
00727 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00728 _mm_min_pi16(__m64 __a, __m64 __b)
00729 {
00730   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
00731 }
00732 
00733 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00734 _mm_min_pu8(__m64 __a, __m64 __b)
00735 {
00736   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
00737 }
00738 
00739 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00740 _mm_movemask_pi8(__m64 __a)
00741 {
00742   return __builtin_ia32_pmovmskb((__v8qi)__a);
00743 }
00744 
00745 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00746 _mm_mulhi_pu16(__m64 __a, __m64 __b)
00747 {
00748   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
00749 }
00750 
00751 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
00752   __m64 __a = (a); \
00753   (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })
00754 
00755 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00756 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
00757 {
00758   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
00759 }
00760 
00761 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00762 _mm_avg_pu8(__m64 __a, __m64 __b)
00763 {
00764   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
00765 }
00766 
00767 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00768 _mm_avg_pu16(__m64 __a, __m64 __b)
00769 {
00770   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
00771 }
00772 
00773 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00774 _mm_sad_pu8(__m64 __a, __m64 __b)
00775 {
00776   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
00777 }
00778 
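/* Editor's example (not part of the upstream header): _mm_sad_pu8 sums the
 * absolute differences of eight unsigned bytes into the low 16 bits of the
 * result, a common block-matching metric. Code mixing these __m64 intrinsics
 * with x87 floating point must still call _mm_empty() from <mmintrin.h>.
 * Hypothetical helper name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_sad8(__m64 __a, __m64 __b)
{
  return _mm_cvtsi64_si32(_mm_sad_pu8(__a, __b));
}
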
00779 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
00780 _mm_getcsr(void)
00781 {
00782   return __builtin_ia32_stmxcsr();
00783 }
00784 
00785 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00786 _mm_setcsr(unsigned int __i)
00787 {
00788   __builtin_ia32_ldmxcsr(__i);
00789 }
00790 
00791 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
00792   __m128 __a = (a); \
00793   __m128 __b = (b); \
00794   (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
00795                                   (mask) & 0x3, ((mask) & 0xc) >> 2, \
00796                                   (((mask) & 0x30) >> 4) + 4, \
00797                                   (((mask) & 0xc0) >> 6) + 4); })
00798 
00799 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00800 _mm_unpackhi_ps(__m128 __a, __m128 __b)
00801 {
00802   return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
00803 }
00804 
00805 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00806 _mm_unpacklo_ps(__m128 __a, __m128 __b)
00807 {
00808   return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
00809 }
00810 
00811 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00812 _mm_move_ss(__m128 __a, __m128 __b)
00813 {
00814   return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
00815 }
00816 
00817 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00818 _mm_movehl_ps(__m128 __a, __m128 __b)
00819 {
00820   return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
00821 }
00822 
00823 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00824 _mm_movelh_ps(__m128 __a, __m128 __b)
00825 {
00826   return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
00827 }
00828 
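/* Editor's example (not part of the upstream header): a common horizontal-sum
 * idiom built from the data-movement intrinsics above (SSE3's haddps offers an
 * alternative). Hypothetical helper name. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__example_hsum_ps(__m128 __v)
{
  __m128 __shuf = _mm_movehl_ps(__v, __v);     /* lanes 2,3 moved to lanes 0,1 */
  __m128 __sums = _mm_add_ps(__v, __shuf);     /* lane0+lane2, lane1+lane3     */
  __shuf = _mm_shuffle_ps(__sums, __sums, 1);  /* bring lane 1 down to lane 0  */
  __sums = _mm_add_ss(__sums, __shuf);         /* total in lane 0              */
  return _mm_cvtss_f32(__sums);
}
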
00829 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00830 _mm_cvtpi16_ps(__m64 __a)
00831 {
00832   __m64 __b, __c;
00833   __m128 __r;
00834 
00835   __b = _mm_setzero_si64();
00836   __b = _mm_cmpgt_pi16(__b, __a);
00837   __c = _mm_unpackhi_pi16(__a, __b);
00838   __r = _mm_setzero_ps();
00839   __r = _mm_cvtpi32_ps(__r, __c);
00840   __r = _mm_movelh_ps(__r, __r);
00841   __c = _mm_unpacklo_pi16(__a, __b);
00842   __r = _mm_cvtpi32_ps(__r, __c);
00843 
00844   return __r;
00845 }
00846 
00847 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00848 _mm_cvtpu16_ps(__m64 __a)
00849 {
00850   __m64 __b, __c;
00851   __m128 __r;
00852 
00853   __b = _mm_setzero_si64();
00854   __c = _mm_unpackhi_pi16(__a, __b);
00855   __r = _mm_setzero_ps();
00856   __r = _mm_cvtpi32_ps(__r, __c);
00857   __r = _mm_movelh_ps(__r, __r);
00858   __c = _mm_unpacklo_pi16(__a, __b);
00859   __r = _mm_cvtpi32_ps(__r, __c);
00860 
00861   return __r;
00862 }
00863 
00864 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00865 _mm_cvtpi8_ps(__m64 __a)
00866 {
00867   __m64 __b;
00868   
00869   __b = _mm_setzero_si64();
00870   __b = _mm_cmpgt_pi8(__b, __a);
00871   __b = _mm_unpacklo_pi8(__a, __b);
00872 
00873   return _mm_cvtpi16_ps(__b);
00874 }
00875 
00876 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00877 _mm_cvtpu8_ps(__m64 __a)
00878 {
00879   __m64 __b;
00880   
00881   __b = _mm_setzero_si64();
00882   __b = _mm_unpacklo_pi8(__a, __b);
00883 
00884   return _mm_cvtpi16_ps(__b);
00885 }
00886 
00887 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00888 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
00889 {
00890   __m128 __c;
00891   
00892   __c = _mm_setzero_ps();
00893   __c = _mm_cvtpi32_ps(__c, __b);
00894   __c = _mm_movelh_ps(__c, __c);
00895 
00896   return _mm_cvtpi32_ps(__c, __a);
00897 }
00898 
00899 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00900 _mm_cvtps_pi16(__m128 __a)
00901 {
00902   __m64 __b, __c;
00903   
00904   __b = _mm_cvtps_pi32(__a);
00905   __a = _mm_movehl_ps(__a, __a);
00906   __c = _mm_cvtps_pi32(__a);
00907   
00908   return _mm_packs_pi32(__b, __c);
00909 }
00910 
00911 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00912 _mm_cvtps_pi8(__m128 __a)
00913 {
00914   __m64 __b, __c;
00915   
00916   __b = _mm_cvtps_pi16(__a);
00917   __c = _mm_setzero_si64();
00918   
00919   return _mm_packs_pi16(__b, __c);
00920 }
00921 
00922 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00923 _mm_movemask_ps(__m128 __a)
00924 {
00925   return __builtin_ia32_movmskps(__a);
00926 }
00927 
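/* Editor's example (not part of the upstream header): _mm_movemask_ps packs the
 * four lane sign bits into an int, turning a packed comparison into a scalar
 * test. Hypothetical helper name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_any_lt_ps(__m128 __a, __m128 __b)
{
  /* Nonzero if any lane of __a is less than the matching lane of __b. */
  return _mm_movemask_ps(_mm_cmplt_ps(__a, __b)) != 0;
}
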
00928 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
00929 
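/* Editor's example (not part of the upstream header): _MM_SHUFFLE encodes the
 * four source-lane selectors for _mm_shuffle_ps, highest result lane first.
 * Hypothetical helper name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_broadcast_lane0(__m128 __v)
{
  /* Replicate lane 0 of __v into all four lanes. */
  return _mm_shuffle_ps(__v, __v, _MM_SHUFFLE(0, 0, 0, 0));
}
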
00930 #define _MM_EXCEPT_INVALID    (0x0001)
00931 #define _MM_EXCEPT_DENORM     (0x0002)
00932 #define _MM_EXCEPT_DIV_ZERO   (0x0004)
00933 #define _MM_EXCEPT_OVERFLOW   (0x0008)
00934 #define _MM_EXCEPT_UNDERFLOW  (0x0010)
00935 #define _MM_EXCEPT_INEXACT    (0x0020)
00936 #define _MM_EXCEPT_MASK       (0x003f)
00937 
00938 #define _MM_MASK_INVALID      (0x0080)
00939 #define _MM_MASK_DENORM       (0x0100)
00940 #define _MM_MASK_DIV_ZERO     (0x0200)
00941 #define _MM_MASK_OVERFLOW     (0x0400)
00942 #define _MM_MASK_UNDERFLOW    (0x0800)
00943 #define _MM_MASK_INEXACT      (0x1000)
00944 #define _MM_MASK_MASK         (0x1f80)
00945 
00946 #define _MM_ROUND_NEAREST     (0x0000)
00947 #define _MM_ROUND_DOWN        (0x2000)
00948 #define _MM_ROUND_UP          (0x4000)
00949 #define _MM_ROUND_TOWARD_ZERO (0x6000)
00950 #define _MM_ROUND_MASK        (0x6000)
00951 
00952 #define _MM_FLUSH_ZERO_MASK   (0x8000)
00953 #define _MM_FLUSH_ZERO_ON     (0x8000)
00954 #define _MM_FLUSH_ZERO_OFF    (0x0000)
00955 
00956 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
00957 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
00958 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
00959 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
00960 
00961 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
00962 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
00963 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
00964 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
00965 
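/* Editor's example (not part of the upstream header): a sketch of scoped MXCSR
 * manipulation using the accessors above, here enabling flush-to-zero around a
 * denormal-heavy computation. Hypothetical helper name; the caller is expected
 * to restore the returned value with _mm_setcsr(). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__example_enable_ftz(void)
{
  unsigned int __old = _mm_getcsr();
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  return __old;
}
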
00966 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
00967 do { \
00968   __m128 tmp3, tmp2, tmp1, tmp0; \
00969   tmp0 = _mm_unpacklo_ps((row0), (row1)); \
00970   tmp2 = _mm_unpacklo_ps((row2), (row3)); \
00971   tmp1 = _mm_unpackhi_ps((row0), (row1)); \
00972   tmp3 = _mm_unpackhi_ps((row2), (row3)); \
00973   (row0) = _mm_movelh_ps(tmp0, tmp2); \
00974   (row1) = _mm_movehl_ps(tmp2, tmp0); \
00975   (row2) = _mm_movelh_ps(tmp1, tmp3); \
00976   (row3) = _mm_movehl_ps(tmp3, tmp1); \
00977 } while (0)
00978 
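/* Editor's example (not part of the upstream header): _MM_TRANSPOSE4_PS rewrites
 * its four row arguments in place, so they must be modifiable lvalues. A sketch
 * with a hypothetical helper name; __m is assumed to point at a 16-byte-aligned,
 * row-major 4x4 matrix. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
__example_transpose_4x4(float *__m)
{
  __m128 __r0 = _mm_load_ps(__m + 0);
  __m128 __r1 = _mm_load_ps(__m + 4);
  __m128 __r2 = _mm_load_ps(__m + 8);
  __m128 __r3 = _mm_load_ps(__m + 12);
  _MM_TRANSPOSE4_PS(__r0, __r1, __r2, __r3);
  _mm_store_ps(__m + 0, __r0);
  _mm_store_ps(__m + 4, __r1);
  _mm_store_ps(__m + 8, __r2);
  _mm_store_ps(__m + 12, __r3);
}
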
00979 /* Aliases for compatibility. */
00980 #define _m_pextrw _mm_extract_pi16
00981 #define _m_pinsrw _mm_insert_pi16
00982 #define _m_pmaxsw _mm_max_pi16
00983 #define _m_pmaxub _mm_max_pu8
00984 #define _m_pminsw _mm_min_pi16
00985 #define _m_pminub _mm_min_pu8
00986 #define _m_pmovmskb _mm_movemask_pi8
00987 #define _m_pmulhuw _mm_mulhi_pu16
00988 #define _m_pshufw _mm_shuffle_pi16
00989 #define _m_maskmovq _mm_maskmove_si64
00990 #define _m_pavgb _mm_avg_pu8
00991 #define _m_pavgw _mm_avg_pu16
00992 #define _m_psadbw _mm_sad_pu8
00993 #define _m_ _mm_
00995 
00996 /* Ugly hack for backwards-compatibility (compatible with gcc) */
00997 #ifdef __SSE2__
00998 #include <emmintrin.h>
00999 #endif
01000 
01001 #endif /* __SSE__ */
01002 
01003 #endif /* __XMMINTRIN_H */