clang API Documentation

emmintrin.h
Go to the documentation of this file.
00001 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
00002  *
00003  * Permission is hereby granted, free of charge, to any person obtaining a copy
00004  * of this software and associated documentation files (the "Software"), to deal
00005  * in the Software without restriction, including without limitation the rights
00006  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007  * copies of the Software, and to permit persons to whom the Software is
00008  * furnished to do so, subject to the following conditions:
00009  *
00010  * The above copyright notice and this permission notice shall be included in
00011  * all copies or substantial portions of the Software.
00012  *
00013  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019  * THE SOFTWARE.
00020  *
00021  *===-----------------------------------------------------------------------===
00022  */
00023 
00024 #ifndef __EMMINTRIN_H
00025 #define __EMMINTRIN_H
00026 
00027 #ifndef __SSE2__
00028 #error "SSE2 instruction set not enabled"
00029 #else
00030 
00031 #include <xmmintrin.h>
00032 
/* Public 128-bit vector types: two doubles, and two 64-bit integers. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte lane views used for casts inside this header. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
/* Plain char lanes: their signedness follows the target's char. */
typedef char __v16qi __attribute__((__vector_size__(16)));
00041 
00042 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00043 _mm_add_sd(__m128d __a, __m128d __b)
00044 {
00045   __a[0] += __b[0];
00046   return __a;
00047 }
00048 
00049 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00050 _mm_add_pd(__m128d __a, __m128d __b)
00051 {
00052   return __a + __b;
00053 }
00054 
00055 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00056 _mm_sub_sd(__m128d __a, __m128d __b)
00057 {
00058   __a[0] -= __b[0];
00059   return __a;
00060 }
00061 
00062 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00063 _mm_sub_pd(__m128d __a, __m128d __b)
00064 {
00065   return __a - __b;
00066 }
00067 
00068 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00069 _mm_mul_sd(__m128d __a, __m128d __b)
00070 {
00071   __a[0] *= __b[0];
00072   return __a;
00073 }
00074 
00075 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00076 _mm_mul_pd(__m128d __a, __m128d __b)
00077 {
00078   return __a * __b;
00079 }
00080 
00081 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00082 _mm_div_sd(__m128d __a, __m128d __b)
00083 {
00084   __a[0] /= __b[0];
00085   return __a;
00086 }
00087 
00088 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00089 _mm_div_pd(__m128d __a, __m128d __b)
00090 {
00091   return __a / __b;
00092 }
00093 
00094 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00095 _mm_sqrt_sd(__m128d __a, __m128d __b)
00096 {
00097   __m128d __c = __builtin_ia32_sqrtsd(__b);
00098   return (__m128d) { __c[0], __a[1] };
00099 }
00100 
00101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00102 _mm_sqrt_pd(__m128d __a)
00103 {
00104   return __builtin_ia32_sqrtpd(__a);
00105 }
00106 
00107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00108 _mm_min_sd(__m128d __a, __m128d __b)
00109 {
00110   return __builtin_ia32_minsd(__a, __b);
00111 }
00112 
00113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00114 _mm_min_pd(__m128d __a, __m128d __b)
00115 {
00116   return __builtin_ia32_minpd(__a, __b);
00117 }
00118 
00119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00120 _mm_max_sd(__m128d __a, __m128d __b)
00121 {
00122   return __builtin_ia32_maxsd(__a, __b);
00123 }
00124 
00125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00126 _mm_max_pd(__m128d __a, __m128d __b)
00127 {
00128   return __builtin_ia32_maxpd(__a, __b);
00129 }
00130 
00131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00132 _mm_and_pd(__m128d __a, __m128d __b)
00133 {
00134   return (__m128d)((__v4si)__a & (__v4si)__b);
00135 }
00136 
00137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00138 _mm_andnot_pd(__m128d __a, __m128d __b)
00139 {
00140   return (__m128d)(~(__v4si)__a & (__v4si)__b);
00141 }
00142 
00143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00144 _mm_or_pd(__m128d __a, __m128d __b)
00145 {
00146   return (__m128d)((__v4si)__a | (__v4si)__b);
00147 }
00148 
00149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00150 _mm_xor_pd(__m128d __a, __m128d __b)
00151 {
00152   return (__m128d)((__v4si)__a ^ (__v4si)__b);
00153 }
00154 
00155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00156 _mm_cmpeq_pd(__m128d __a, __m128d __b)
00157 {
00158   return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
00159 }
00160 
00161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00162 _mm_cmplt_pd(__m128d __a, __m128d __b)
00163 {
00164   return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
00165 }
00166 
00167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00168 _mm_cmple_pd(__m128d __a, __m128d __b)
00169 {
00170   return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
00171 }
00172 
00173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00174 _mm_cmpgt_pd(__m128d __a, __m128d __b)
00175 {
00176   return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
00177 }
00178 
00179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00180 _mm_cmpge_pd(__m128d __a, __m128d __b)
00181 {
00182   return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
00183 }
00184 
00185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00186 _mm_cmpord_pd(__m128d __a, __m128d __b)
00187 {
00188   return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
00189 }
00190 
00191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00192 _mm_cmpunord_pd(__m128d __a, __m128d __b)
00193 {
00194   return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
00195 }
00196 
00197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00198 _mm_cmpneq_pd(__m128d __a, __m128d __b)
00199 {
00200   return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
00201 }
00202 
00203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00204 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
00205 {
00206   return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
00207 }
00208 
00209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00210 _mm_cmpnle_pd(__m128d __a, __m128d __b)
00211 {
00212   return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
00213 }
00214 
00215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00216 _mm_cmpngt_pd(__m128d __a, __m128d __b)
00217 {
00218   return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
00219 }
00220 
00221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00222 _mm_cmpnge_pd(__m128d __a, __m128d __b)
00223 {
00224   return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
00225 }
00226 
00227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00228 _mm_cmpeq_sd(__m128d __a, __m128d __b)
00229 {
00230   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
00231 }
00232 
00233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00234 _mm_cmplt_sd(__m128d __a, __m128d __b)
00235 {
00236   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
00237 }
00238 
00239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00240 _mm_cmple_sd(__m128d __a, __m128d __b)
00241 {
00242   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
00243 }
00244 
00245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00246 _mm_cmpgt_sd(__m128d __a, __m128d __b)
00247 {
00248   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
00249   return (__m128d) { __c[0], __a[1] };
00250 }
00251 
00252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00253 _mm_cmpge_sd(__m128d __a, __m128d __b)
00254 {
00255   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
00256   return (__m128d) { __c[0], __a[1] };
00257 }
00258 
00259 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00260 _mm_cmpord_sd(__m128d __a, __m128d __b)
00261 {
00262   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
00263 }
00264 
00265 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00266 _mm_cmpunord_sd(__m128d __a, __m128d __b)
00267 {
00268   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
00269 }
00270 
00271 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00272 _mm_cmpneq_sd(__m128d __a, __m128d __b)
00273 {
00274   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
00275 }
00276 
00277 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00278 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
00279 {
00280   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
00281 }
00282 
00283 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00284 _mm_cmpnle_sd(__m128d __a, __m128d __b)
00285 {
00286   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
00287 }
00288 
00289 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00290 _mm_cmpngt_sd(__m128d __a, __m128d __b)
00291 {
00292   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
00293   return (__m128d) { __c[0], __a[1] };
00294 }
00295 
00296 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00297 _mm_cmpnge_sd(__m128d __a, __m128d __b)
00298 {
00299   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
00300   return (__m128d) { __c[0], __a[1] };
00301 }
00302 
00303 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00304 _mm_comieq_sd(__m128d __a, __m128d __b)
00305 {
00306   return __builtin_ia32_comisdeq(__a, __b);
00307 }
00308 
00309 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00310 _mm_comilt_sd(__m128d __a, __m128d __b)
00311 {
00312   return __builtin_ia32_comisdlt(__a, __b);
00313 }
00314 
00315 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00316 _mm_comile_sd(__m128d __a, __m128d __b)
00317 {
00318   return __builtin_ia32_comisdle(__a, __b);
00319 }
00320 
00321 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00322 _mm_comigt_sd(__m128d __a, __m128d __b)
00323 {
00324   return __builtin_ia32_comisdgt(__a, __b);
00325 }
00326 
00327 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00328 _mm_comige_sd(__m128d __a, __m128d __b)
00329 {
00330   return __builtin_ia32_comisdge(__a, __b);
00331 }
00332 
00333 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00334 _mm_comineq_sd(__m128d __a, __m128d __b)
00335 {
00336   return __builtin_ia32_comisdneq(__a, __b);
00337 }
00338 
00339 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00340 _mm_ucomieq_sd(__m128d __a, __m128d __b)
00341 {
00342   return __builtin_ia32_ucomisdeq(__a, __b);
00343 }
00344 
00345 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00346 _mm_ucomilt_sd(__m128d __a, __m128d __b)
00347 {
00348   return __builtin_ia32_ucomisdlt(__a, __b);
00349 }
00350 
00351 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00352 _mm_ucomile_sd(__m128d __a, __m128d __b)
00353 {
00354   return __builtin_ia32_ucomisdle(__a, __b);
00355 }
00356 
00357 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00358 _mm_ucomigt_sd(__m128d __a, __m128d __b)
00359 {
00360   return __builtin_ia32_ucomisdgt(__a, __b);
00361 }
00362 
00363 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00364 _mm_ucomige_sd(__m128d __a, __m128d __b)
00365 {
00366   return __builtin_ia32_ucomisdge(__a, __b);
00367 }
00368 
00369 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00370 _mm_ucomineq_sd(__m128d __a, __m128d __b)
00371 {
00372   return __builtin_ia32_ucomisdneq(__a, __b);
00373 }
00374 
00375 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00376 _mm_cvtpd_ps(__m128d __a)
00377 {
00378   return __builtin_ia32_cvtpd2ps(__a);
00379 }
00380 
00381 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00382 _mm_cvtps_pd(__m128 __a)
00383 {
00384   return __builtin_ia32_cvtps2pd(__a);
00385 }
00386 
00387 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00388 _mm_cvtepi32_pd(__m128i __a)
00389 {
00390   return __builtin_ia32_cvtdq2pd((__v4si)__a);
00391 }
00392 
00393 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00394 _mm_cvtpd_epi32(__m128d __a)
00395 {
00396   return __builtin_ia32_cvtpd2dq(__a);
00397 }
00398 
00399 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00400 _mm_cvtsd_si32(__m128d __a)
00401 {
00402   return __builtin_ia32_cvtsd2si(__a);
00403 }
00404 
00405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00406 _mm_cvtsd_ss(__m128 __a, __m128d __b)
00407 {
00408   __a[0] = __b[0];
00409   return __a;
00410 }
00411 
00412 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00413 _mm_cvtsi32_sd(__m128d __a, int __b)
00414 {
00415   __a[0] = __b;
00416   return __a;
00417 }
00418 
00419 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00420 _mm_cvtss_sd(__m128d __a, __m128 __b)
00421 {
00422   __a[0] = __b[0];
00423   return __a;
00424 }
00425 
00426 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00427 _mm_cvttpd_epi32(__m128d __a)
00428 {
00429   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
00430 }
00431 
00432 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00433 _mm_cvttsd_si32(__m128d __a)
00434 {
00435   return __a[0];
00436 }
00437 
00438 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00439 _mm_cvtpd_pi32(__m128d __a)
00440 {
00441   return (__m64)__builtin_ia32_cvtpd2pi(__a);
00442 }
00443 
00444 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00445 _mm_cvttpd_pi32(__m128d __a)
00446 {
00447   return (__m64)__builtin_ia32_cvttpd2pi(__a);
00448 }
00449 
00450 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00451 _mm_cvtpi32_pd(__m64 __a)
00452 {
00453   return __builtin_ia32_cvtpi2pd((__v2si)__a);
00454 }
00455 
00456 static __inline__ double __attribute__((__always_inline__, __nodebug__))
00457 _mm_cvtsd_f64(__m128d __a)
00458 {
00459   return __a[0];
00460 }
00461 
00462 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00463 _mm_load_pd(double const *__dp)
00464 {
00465   return *(__m128d*)__dp;
00466 }
00467 
00468 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00469 _mm_load1_pd(double const *__dp)
00470 {
00471   struct __mm_load1_pd_struct {
00472     double __u;
00473   } __attribute__((__packed__, __may_alias__));
00474   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
00475   return (__m128d){ __u, __u };
00476 }
00477 
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
00479 
00480 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00481 _mm_loadr_pd(double const *__dp)
00482 {
00483   __m128d __u = *(__m128d*)__dp;
00484   return __builtin_shufflevector(__u, __u, 1, 0);
00485 }
00486 
00487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00488 _mm_loadu_pd(double const *__dp)
00489 {
00490   struct __loadu_pd {
00491     __m128d __v;
00492   } __attribute__((packed, may_alias));
00493   return ((struct __loadu_pd*)__dp)->__v;
00494 }
00495 
00496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00497 _mm_load_sd(double const *__dp)
00498 {
00499   struct __mm_load_sd_struct {
00500     double __u;
00501   } __attribute__((__packed__, __may_alias__));
00502   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
00503   return (__m128d){ __u, 0 };
00504 }
00505 
00506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00507 _mm_loadh_pd(__m128d __a, double const *__dp)
00508 {
00509   struct __mm_loadh_pd_struct {
00510     double __u;
00511   } __attribute__((__packed__, __may_alias__));
00512   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
00513   return (__m128d){ __a[0], __u };
00514 }
00515 
00516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00517 _mm_loadl_pd(__m128d __a, double const *__dp)
00518 {
00519   struct __mm_loadl_pd_struct {
00520     double __u;
00521   } __attribute__((__packed__, __may_alias__));
00522   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
00523   return (__m128d){ __u, __a[1] };
00524 }
00525 
00526 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00527 _mm_set_sd(double __w)
00528 {
00529   return (__m128d){ __w, 0 };
00530 }
00531 
00532 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00533 _mm_set1_pd(double __w)
00534 {
00535   return (__m128d){ __w, __w };
00536 }
00537 
00538 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00539 _mm_set_pd(double __w, double __x)
00540 {
00541   return (__m128d){ __x, __w };
00542 }
00543 
00544 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00545 _mm_setr_pd(double __w, double __x)
00546 {
00547   return (__m128d){ __w, __x };
00548 }
00549 
00550 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00551 _mm_setzero_pd(void)
00552 {
00553   return (__m128d){ 0, 0 };
00554 }
00555 
00556 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00557 _mm_move_sd(__m128d __a, __m128d __b)
00558 {
00559   return (__m128d){ __b[0], __a[1] };
00560 }
00561 
00562 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00563 _mm_store_sd(double *__dp, __m128d __a)
00564 {
00565   struct __mm_store_sd_struct {
00566     double __u;
00567   } __attribute__((__packed__, __may_alias__));
00568   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
00569 }
00570 
00571 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00572 _mm_store1_pd(double *__dp, __m128d __a)
00573 {
00574   struct __mm_store1_pd_struct {
00575     double __u[2];
00576   } __attribute__((__packed__, __may_alias__));
00577   ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
00578   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
00579 }
00580 
00581 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00582 _mm_store_pd(double *__dp, __m128d __a)
00583 {
00584   *(__m128d *)__dp = __a;
00585 }
00586 
00587 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00588 _mm_storeu_pd(double *__dp, __m128d __a)
00589 {
00590   __builtin_ia32_storeupd(__dp, __a);
00591 }
00592 
00593 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00594 _mm_storer_pd(double *__dp, __m128d __a)
00595 {
00596   __a = __builtin_shufflevector(__a, __a, 1, 0);
00597   *(__m128d *)__dp = __a;
00598 }
00599 
00600 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00601 _mm_storeh_pd(double *__dp, __m128d __a)
00602 {
00603   struct __mm_storeh_pd_struct {
00604     double __u;
00605   } __attribute__((__packed__, __may_alias__));
00606   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
00607 }
00608 
00609 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00610 _mm_storel_pd(double *__dp, __m128d __a)
00611 {
00612   struct __mm_storeh_pd_struct {
00613     double __u;
00614   } __attribute__((__packed__, __may_alias__));
00615   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
00616 }
00617 
00618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00619 _mm_add_epi8(__m128i __a, __m128i __b)
00620 {
00621   return (__m128i)((__v16qi)__a + (__v16qi)__b);
00622 }
00623 
00624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00625 _mm_add_epi16(__m128i __a, __m128i __b)
00626 {
00627   return (__m128i)((__v8hi)__a + (__v8hi)__b);
00628 }
00629 
00630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00631 _mm_add_epi32(__m128i __a, __m128i __b)
00632 {
00633   return (__m128i)((__v4si)__a + (__v4si)__b);
00634 }
00635 
00636 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00637 _mm_add_si64(__m64 __a, __m64 __b)
00638 {
00639   return __a + __b;
00640 }
00641 
00642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00643 _mm_add_epi64(__m128i __a, __m128i __b)
00644 {
00645   return __a + __b;
00646 }
00647 
00648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00649 _mm_adds_epi8(__m128i __a, __m128i __b)
00650 {
00651   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
00652 }
00653 
00654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00655 _mm_adds_epi16(__m128i __a, __m128i __b)
00656 {
00657   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
00658 }
00659 
00660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00661 _mm_adds_epu8(__m128i __a, __m128i __b)
00662 {
00663   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
00664 }
00665 
00666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00667 _mm_adds_epu16(__m128i __a, __m128i __b)
00668 {
00669   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
00670 }
00671 
00672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00673 _mm_avg_epu8(__m128i __a, __m128i __b)
00674 {
00675   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
00676 }
00677 
00678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00679 _mm_avg_epu16(__m128i __a, __m128i __b)
00680 {
00681   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
00682 }
00683 
00684 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00685 _mm_madd_epi16(__m128i __a, __m128i __b)
00686 {
00687   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
00688 }
00689 
00690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00691 _mm_max_epi16(__m128i __a, __m128i __b)
00692 {
00693   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
00694 }
00695 
00696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00697 _mm_max_epu8(__m128i __a, __m128i __b)
00698 {
00699   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
00700 }
00701 
00702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00703 _mm_min_epi16(__m128i __a, __m128i __b)
00704 {
00705   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
00706 }
00707 
00708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00709 _mm_min_epu8(__m128i __a, __m128i __b)
00710 {
00711   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
00712 }
00713 
00714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00715 _mm_mulhi_epi16(__m128i __a, __m128i __b)
00716 {
00717   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
00718 }
00719 
00720 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00721 _mm_mulhi_epu16(__m128i __a, __m128i __b)
00722 {
00723   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
00724 }
00725 
00726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00727 _mm_mullo_epi16(__m128i __a, __m128i __b)
00728 {
00729   return (__m128i)((__v8hi)__a * (__v8hi)__b);
00730 }
00731 
00732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00733 _mm_mul_su32(__m64 __a, __m64 __b)
00734 {
00735   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
00736 }
00737 
00738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00739 _mm_mul_epu32(__m128i __a, __m128i __b)
00740 {
00741   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
00742 }
00743 
00744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00745 _mm_sad_epu8(__m128i __a, __m128i __b)
00746 {
00747   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
00748 }
00749 
00750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00751 _mm_sub_epi8(__m128i __a, __m128i __b)
00752 {
00753   return (__m128i)((__v16qi)__a - (__v16qi)__b);
00754 }
00755 
00756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00757 _mm_sub_epi16(__m128i __a, __m128i __b)
00758 {
00759   return (__m128i)((__v8hi)__a - (__v8hi)__b);
00760 }
00761 
00762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00763 _mm_sub_epi32(__m128i __a, __m128i __b)
00764 {
00765   return (__m128i)((__v4si)__a - (__v4si)__b);
00766 }
00767 
00768 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00769 _mm_sub_si64(__m64 __a, __m64 __b)
00770 {
00771   return __a - __b;
00772 }
00773 
00774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00775 _mm_sub_epi64(__m128i __a, __m128i __b)
00776 {
00777   return __a - __b;
00778 }
00779 
00780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00781 _mm_subs_epi8(__m128i __a, __m128i __b)
00782 {
00783   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
00784 }
00785 
00786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00787 _mm_subs_epi16(__m128i __a, __m128i __b)
00788 {
00789   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
00790 }
00791 
00792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00793 _mm_subs_epu8(__m128i __a, __m128i __b)
00794 {
00795   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
00796 }
00797 
00798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00799 _mm_subs_epu16(__m128i __a, __m128i __b)
00800 {
00801   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
00802 }
00803 
00804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00805 _mm_and_si128(__m128i __a, __m128i __b)
00806 {
00807   return __a & __b;
00808 }
00809 
00810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00811 _mm_andnot_si128(__m128i __a, __m128i __b)
00812 {
00813   return ~__a & __b;
00814 }
00815 
00816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00817 _mm_or_si128(__m128i __a, __m128i __b)
00818 {
00819   return __a | __b;
00820 }
00821 
00822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00823 _mm_xor_si128(__m128i __a, __m128i __b)
00824 {
00825   return __a ^ __b;
00826 }
00827 
/* _mm_slli_si128(a, count): statement-expression macro.
   - Evaluates `a` exactly once into the local __a. The surrounding
     diagnostic push/ignored/pop pragmas silence -Wshadow for that local, in
     case the caller already has a variable named __a in scope.
   - Yields __builtin_ia32_pslldqi128(__a, (count)*8) as __m128i. The *8
     presumably converts a byte count into the bit count the builtin expects
     -- confirm against the builtin's definition.
   Implemented as a macro rather than a function, presumably so that `count`
   reaches the builtin as a constant expression. */
00828 #define _mm_slli_si128(a, count) __extension__ ({ \
00829   _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
00830   __m128i __a = (a); \
00831    _Pragma("clang diagnostic pop"); \
00832   (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
00833 
00834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00835 _mm_slli_epi16(__m128i __a, int __count)
00836 {
00837   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
00838 }
00839 
00840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00841 _mm_sll_epi16(__m128i __a, __m128i __count)
00842 {
00843   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
00844 }
00845 
00846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00847 _mm_slli_epi32(__m128i __a, int __count)
00848 {
00849   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
00850 }
00851 
00852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00853 _mm_sll_epi32(__m128i __a, __m128i __count)
00854 {
00855   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
00856 }
00857 
00858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00859 _mm_slli_epi64(__m128i __a, int __count)
00860 {
00861   return __builtin_ia32_psllqi128(__a, __count);
00862 }
00863 
00864 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00865 _mm_sll_epi64(__m128i __a, __m128i __count)
00866 {
00867   return __builtin_ia32_psllq128(__a, __count);
00868 }
00869 
00870 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00871 _mm_srai_epi16(__m128i __a, int __count)
00872 {
00873   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
00874 }
00875 
00876 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00877 _mm_sra_epi16(__m128i __a, __m128i __count)
00878 {
00879   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
00880 }
00881 
00882 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00883 _mm_srai_epi32(__m128i __a, int __count)
00884 {
00885   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
00886 }
00887 
00888 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00889 _mm_sra_epi32(__m128i __a, __m128i __count)
00890 {
00891   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
00892 }
00893 
00894 
/* _mm_srli_si128(a, count): statement-expression macro.
   - Evaluates `a` exactly once into the local __a. The surrounding
     diagnostic push/ignored/pop pragmas silence -Wshadow for that local, in
     case the caller already has a variable named __a in scope.
   - Yields __builtin_ia32_psrldqi128(__a, (count)*8) as __m128i. The *8
     presumably converts a byte count into the bit count the builtin expects
     -- confirm against the builtin's definition.
   Implemented as a macro rather than a function, presumably so that `count`
   reaches the builtin as a constant expression. */
00895 #define _mm_srli_si128(a, count) __extension__ ({ \
00896   _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
00897   __m128i __a = (a); \
00898   _Pragma("clang diagnostic pop"); \
00899   (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
00900 
00901 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00902 _mm_srli_epi16(__m128i __a, int __count)
00903 {
00904   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
00905 }
00906 
00907 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00908 _mm_srl_epi16(__m128i __a, __m128i __count)
00909 {
00910   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
00911 }
00912 
00913 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00914 _mm_srli_epi32(__m128i __a, int __count)
00915 {
00916   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
00917 }
00918 
00919 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00920 _mm_srl_epi32(__m128i __a, __m128i __count)
00921 {
00922   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
00923 }
00924 
00925 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00926 _mm_srli_epi64(__m128i __a, int __count)
00927 {
00928   return __builtin_ia32_psrlqi128(__a, __count);
00929 }
00930 
00931 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00932 _mm_srl_epi64(__m128i __a, __m128i __count)
00933 {
00934   return __builtin_ia32_psrlq128(__a, __count);
00935 }
00936 
00937 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00938 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
00939 {
00940   return (__m128i)((__v16qi)__a == (__v16qi)__b);
00941 }
00942 
00943 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00944 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
00945 {
00946   return (__m128i)((__v8hi)__a == (__v8hi)__b);
00947 }
00948 
00949 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00950 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
00951 {
00952   return (__m128i)((__v4si)__a == (__v4si)__b);
00953 }
00954 
00955 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00956 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
00957 {
00958   /* This function always performs a signed comparison, but __v16qi is a char
00959      which may be signed or unsigned. */
00960   typedef signed char __v16qs __attribute__((__vector_size__(16)));
00961   return (__m128i)((__v16qs)__a > (__v16qs)__b);
00962 }
00963 
00964 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00965 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
00966 {
00967   return (__m128i)((__v8hi)__a > (__v8hi)__b);
00968 }
00969 
00970 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00971 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
00972 {
00973   return (__m128i)((__v4si)__a > (__v4si)__b);
00974 }
00975 
00976 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00977 _mm_cmplt_epi8(__m128i __a, __m128i __b)
00978 {
00979   return _mm_cmpgt_epi8(__b, __a);
00980 }
00981 
00982 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00983 _mm_cmplt_epi16(__m128i __a, __m128i __b)
00984 {
00985   return _mm_cmpgt_epi16(__b, __a);
00986 }
00987 
00988 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00989 _mm_cmplt_epi32(__m128i __a, __m128i __b)
00990 {
00991   return _mm_cmpgt_epi32(__b, __a);
00992 }
00993 
00994 #ifdef __x86_64__
00995 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00996 _mm_cvtsi64_sd(__m128d __a, long long __b)
00997 {
00998   __a[0] = __b;
00999   return __a;
01000 }
01001 
01002 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
01003 _mm_cvtsd_si64(__m128d __a)
01004 {
01005   return __builtin_ia32_cvtsd2si64(__a);
01006 }
01007 
01008 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
01009 _mm_cvttsd_si64(__m128d __a)
01010 {
01011   return __a[0];
01012 }
01013 #endif
01014 
01015 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01016 _mm_cvtepi32_ps(__m128i __a)
01017 {
01018   return __builtin_ia32_cvtdq2ps((__v4si)__a);
01019 }
01020 
01021 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01022 _mm_cvtps_epi32(__m128 __a)
01023 {
01024   return (__m128i)__builtin_ia32_cvtps2dq(__a);
01025 }
01026 
01027 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01028 _mm_cvttps_epi32(__m128 __a)
01029 {
01030   return (__m128i)__builtin_ia32_cvttps2dq(__a);
01031 }
01032 
01033 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01034 _mm_cvtsi32_si128(int __a)
01035 {
01036   return (__m128i)(__v4si){ __a, 0, 0, 0 };
01037 }
01038 
01039 #ifdef __x86_64__
01040 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01041 _mm_cvtsi64_si128(long long __a)
01042 {
01043   return (__m128i){ __a, 0 };
01044 }
01045 #endif
01046 
01047 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01048 _mm_cvtsi128_si32(__m128i __a)
01049 {
01050   __v4si __b = (__v4si)__a;
01051   return __b[0];
01052 }
01053 
01054 #ifdef __x86_64__
01055 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
01056 _mm_cvtsi128_si64(__m128i __a)
01057 {
01058   return __a[0];
01059 }
01060 #endif
01061 
01062 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01063 _mm_load_si128(__m128i const *__p)
01064 {
01065   return *__p;
01066 }
01067 
01068 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01069 _mm_loadu_si128(__m128i const *__p)
01070 {
01071   struct __loadu_si128 {
01072     __m128i __v;
01073   } __attribute__((packed, may_alias));
01074   return ((struct __loadu_si128*)__p)->__v;
01075 }
01076 
01077 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01078 _mm_loadl_epi64(__m128i const *__p)
01079 {
01080   struct __mm_loadl_epi64_struct {
01081     long long __u;
01082   } __attribute__((__packed__, __may_alias__));
01083   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
01084 }
01085 
01086 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01087 _mm_set_epi64x(long long q1, long long q0)
01088 {
01089   return (__m128i){ q0, q1 };
01090 }
01091 
01092 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01093 _mm_set_epi64(__m64 q1, __m64 q0)
01094 {
01095   return (__m128i){ (long long)q0, (long long)q1 };
01096 }
01097 
01098 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01099 _mm_set_epi32(int i3, int i2, int i1, int i0)
01100 {
01101   return (__m128i)(__v4si){ i0, i1, i2, i3};
01102 }
01103 
01104 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01105 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
01106 {
01107   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
01108 }
01109 
01110 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01111 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
01112 {
01113   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
01114 }
01115 
01116 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01117 _mm_set1_epi64x(long long __q)
01118 {
01119   return (__m128i){ __q, __q };
01120 }
01121 
01122 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01123 _mm_set1_epi64(__m64 __q)
01124 {
01125   return (__m128i){ (long long)__q, (long long)__q };
01126 }
01127 
01128 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01129 _mm_set1_epi32(int __i)
01130 {
01131   return (__m128i)(__v4si){ __i, __i, __i, __i };
01132 }
01133 
01134 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01135 _mm_set1_epi16(short __w)
01136 {
01137   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
01138 }
01139 
01140 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01141 _mm_set1_epi8(char __b)
01142 {
01143   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
01144 }
01145 
01146 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01147 _mm_setr_epi64(__m64 q0, __m64 q1)
01148 {
01149   return (__m128i){ (long long)q0, (long long)q1 };
01150 }
01151 
01152 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01153 _mm_setr_epi32(int i0, int i1, int i2, int i3)
01154 {
01155   return (__m128i)(__v4si){ i0, i1, i2, i3};
01156 }
01157 
01158 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01159 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
01160 {
01161   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
01162 }
01163 
01164 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01165 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
01166 {
01167   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
01168 }
01169 
01170 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01171 _mm_setzero_si128(void)
01172 {
01173   return (__m128i){ 0LL, 0LL };
01174 }
01175 
01176 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01177 _mm_store_si128(__m128i *__p, __m128i __b)
01178 {
01179   *__p = __b;
01180 }
01181 
01182 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01183 _mm_storeu_si128(__m128i *__p, __m128i __b)
01184 {
01185   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
01186 }
01187 
01188 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01189 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
01190 {
01191   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
01192 }
01193 
01194 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01195 _mm_storel_epi64(__m128i *__p, __m128i __a)
01196 {
01197   struct __mm_storel_epi64_struct {
01198     long long __u;
01199   } __attribute__((__packed__, __may_alias__));
01200   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
01201 }
01202 
01203 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01204 _mm_stream_pd(double *__p, __m128d __a)
01205 {
01206   __builtin_ia32_movntpd(__p, __a);
01207 }
01208 
01209 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01210 _mm_stream_si128(__m128i *__p, __m128i __a)
01211 {
01212   __builtin_ia32_movntdq(__p, __a);
01213 }
01214 
/* Store the 32-bit integer __a to *__p using the MOVNTI (non-temporal
   store) builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;

  __builtin_ia32_movnti(__dst, __a);
}
01220 
01221 #ifdef __x86_64__
/* Store the 64-bit integer __a to *__p using the 64-bit MOVNTI
   (non-temporal store) builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;

  __builtin_ia32_movnti64(__dst, __a);
}
01227 #endif
01228 
/* Issue the CLFLUSH builtin for the address __p (cache-line flush). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__addr = __p;

  __builtin_ia32_clflush(__addr);
}
01234 
/* Issue the LFENCE builtin (load fence); takes no arguments and returns
   nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)__builtin_ia32_lfence();
}
01240 
/* Issue the MFENCE builtin (memory fence); takes no arguments and returns
   nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)__builtin_ia32_mfence();
}
01246 
01247 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01248 _mm_packs_epi16(__m128i __a, __m128i __b)
01249 {
01250   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
01251 }
01252 
01253 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01254 _mm_packs_epi32(__m128i __a, __m128i __b)
01255 {
01256   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
01257 }
01258 
01259 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01260 _mm_packus_epi16(__m128i __a, __m128i __b)
01261 {
01262   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
01263 }
01264 
01265 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01266 _mm_extract_epi16(__m128i __a, int __imm)
01267 {
01268   __v8hi __b = (__v8hi)__a;
01269   return (unsigned short)__b[__imm & 7];
01270 }
01271 
01272 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01273 _mm_insert_epi16(__m128i __a, int __b, int __imm)
01274 {
01275   __v8hi __c = (__v8hi)__a;
01276   __c[__imm & 7] = __b;
01277   return (__m128i)__c;
01278 }
01279 
01280 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01281 _mm_movemask_epi8(__m128i __a)
01282 {
01283   return __builtin_ia32_pmovmskb128((__v16qi)__a);
01284 }
01285 
/* Permute the four 32-bit elements of `a`. Result element k is the element
   of `a` selected by bits [2k+1:2k] of `imm`. The second shuffle operand is
   an all-zero vector that the 0..3 indices never select. `a` is captured
   once in a temporary; -Wshadow is silenced around that declaration because
   the temporary may shadow a caller variable of the same name. `imm` is
   expanded several times and must be usable as a constant shuffle index. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3); })
01293 
/* Permute the four lowest 16-bit elements of `a`. Result element k (k = 0..3)
   is the element 0..3 of `a` selected by bits [2k+1:2k] of `imm`; elements
   4..7 are copied through unchanged. The second shuffle operand is an
   all-zero vector that the indices never select. `a` is captured once in a
   temporary; -Wshadow is silenced around that declaration. `imm` is expanded
   several times and must be usable as a constant shuffle index. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
01302 
/* Permute the four highest 16-bit elements of `a`. Elements 0..3 are copied
   through unchanged; result element 4+k (k = 0..3) is element 4+j of `a`,
   where j is bits [2k+1:2k] of `imm`. The second shuffle operand is an
   all-zero vector that the indices never select. `a` is captured once in a
   temporary; -Wshadow is silenced around that declaration. `imm` is expanded
   several times and must be usable as a constant shuffle index. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
01313 
01314 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01315 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
01316 {
01317   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
01318 }
01319 
01320 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01321 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
01322 {
01323   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
01324 }
01325 
01326 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01327 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
01328 {
01329   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
01330 }
01331 
01332 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01333 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
01334 {
01335   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
01336 }
01337 
01338 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01339 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
01340 {
01341   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
01342 }
01343 
01344 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01345 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
01346 {
01347   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
01348 }
01349 
01350 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01351 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
01352 {
01353   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
01354 }
01355 
01356 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01357 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
01358 {
01359   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
01360 }
01361 
01362 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
01363 _mm_movepi64_pi64(__m128i __a)
01364 {
01365   return (__m64)__a[0];
01366 }
01367 
01368 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01369 _mm_movpi64_epi64(__m64 __a)
01370 {
01371   return (__m128i){ (long long)__a, 0 };
01372 }
01373 
01374 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01375 _mm_move_epi64(__m128i __a)
01376 {
01377   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
01378 }
01379 
01380 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01381 _mm_unpackhi_pd(__m128d __a, __m128d __b)
01382 {
01383   return __builtin_shufflevector(__a, __b, 1, 2+1);
01384 }
01385 
01386 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01387 _mm_unpacklo_pd(__m128d __a, __m128d __b)
01388 {
01389   return __builtin_shufflevector(__a, __b, 0, 2+0);
01390 }
01391 
01392 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01393 _mm_movemask_pd(__m128d __a)
01394 {
01395   return __builtin_ia32_movmskpd(__a);
01396 }
01397 
/* Select one double from each operand: result element 0 is a[bit 0 of i]
   and result element 1 is b[bit 1 of i] (shuffle indices 2..3 name the
   elements of `b`). `a` and `b` are each captured once in a temporary;
   -Wshadow is silenced around those declarations because the temporaries
   may shadow caller variables of the same name. `i` is expanded twice and
   must be usable as a constant shuffle index. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128d __a = (a); \
  __m128d __b = (b); \
  _Pragma("clang diagnostic pop"); \
  __builtin_shufflevector(__a, __b, \
                          (i) & 1, \
                          2 + (((i) >> 1) & 1)); })
01404 
01405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01406 _mm_castpd_ps(__m128d __a)
01407 {
01408   return (__m128)__a;
01409 }
01410 
01411 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01412 _mm_castpd_si128(__m128d __a)
01413 {
01414   return (__m128i)__a;
01415 }
01416 
01417 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01418 _mm_castps_pd(__m128 __a)
01419 {
01420   return (__m128d)__a;
01421 }
01422 
01423 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01424 _mm_castps_si128(__m128 __a)
01425 {
01426   return (__m128i)__a;
01427 }
01428 
01429 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01430 _mm_castsi128_ps(__m128i __a)
01431 {
01432   return (__m128)__a;
01433 }
01434 
01435 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01436 _mm_castsi128_pd(__m128i __a)
01437 {
01438   return (__m128d)__a;
01439 }
01440 
/* Emit a single PAUSE instruction through inline assembly. The statement is
   marked volatile so the compiler will not remove it. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__("pause");
}
01446 
/* Build a two-bit selector: x is placed in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
01448 
01449 #endif /* __SSE2__ */
01450 
01451 #endif /* __EMMINTRIN_H */