clang API Documentation
00001 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 00002 * 00003 * Permission is hereby granted, free of charge, to any person obtaining a copy 00004 * of this software and associated documentation files (the "Software"), to deal 00005 * in the Software without restriction, including without limitation the rights 00006 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00007 * copies of the Software, and to permit persons to whom the Software is 00008 * furnished to do so, subject to the following conditions: 00009 * 00010 * The above copyright notice and this permission notice shall be included in 00011 * all copies or substantial portions of the Software. 00012 * 00013 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00014 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00015 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00016 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00017 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00018 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00019 * THE SOFTWARE. 00020 * 00021 *===-----------------------------------------------------------------------=== 00022 */ 00023 00024 #ifndef __EMMINTRIN_H 00025 #define __EMMINTRIN_H 00026 00027 #ifndef __SSE2__ 00028 #error "SSE2 instruction set not enabled" 00029 #else 00030 00031 #include <xmmintrin.h> 00032 00033 typedef double __m128d __attribute__((__vector_size__(16))); 00034 typedef long long __m128i __attribute__((__vector_size__(16))); 00035 00036 /* Type defines. */ 00037 typedef double __v2df __attribute__ ((__vector_size__ (16))); 00038 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 00039 typedef short __v8hi __attribute__((__vector_size__(16))); 00040 typedef char __v16qi __attribute__((__vector_size__(16))); 00041 00042 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00043 _mm_add_sd(__m128d __a, __m128d __b) 00044 { 00045 __a[0] += __b[0]; 00046 return __a; 00047 } 00048 00049 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00050 _mm_add_pd(__m128d __a, __m128d __b) 00051 { 00052 return __a + __b; 00053 } 00054 00055 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00056 _mm_sub_sd(__m128d __a, __m128d __b) 00057 { 00058 __a[0] -= __b[0]; 00059 return __a; 00060 } 00061 00062 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00063 _mm_sub_pd(__m128d __a, __m128d __b) 00064 { 00065 return __a - __b; 00066 } 00067 00068 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00069 _mm_mul_sd(__m128d __a, __m128d __b) 00070 { 00071 __a[0] *= __b[0]; 00072 return __a; 00073 } 00074 00075 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00076 _mm_mul_pd(__m128d __a, __m128d __b) 00077 { 00078 return __a * __b; 00079 } 00080 00081 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00082 _mm_div_sd(__m128d __a, __m128d __b) 00083 { 00084 __a[0] /= __b[0]; 00085 return __a; 00086 } 00087 00088 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00089 _mm_div_pd(__m128d __a, __m128d __b) 00090 { 00091 return __a / __b; 00092 } 00093 00094 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00095 _mm_sqrt_sd(__m128d __a, __m128d __b) 00096 { 00097 __m128d __c = __builtin_ia32_sqrtsd(__b); 00098 return (__m128d) { __c[0], __a[1] }; 00099 } 00100 00101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00102 _mm_sqrt_pd(__m128d __a) 00103 { 00104 return __builtin_ia32_sqrtpd(__a); 00105 } 00106 00107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00108 _mm_min_sd(__m128d __a, __m128d __b) 00109 { 00110 return __builtin_ia32_minsd(__a, __b); 00111 } 00112 00113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00114 _mm_min_pd(__m128d __a, __m128d __b) 00115 { 00116 return __builtin_ia32_minpd(__a, __b); 00117 } 00118 00119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00120 _mm_max_sd(__m128d __a, __m128d __b) 00121 { 00122 return __builtin_ia32_maxsd(__a, __b); 00123 } 00124 00125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00126 _mm_max_pd(__m128d __a, __m128d __b) 00127 { 00128 return __builtin_ia32_maxpd(__a, __b); 00129 } 00130 00131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00132 _mm_and_pd(__m128d __a, __m128d __b) 00133 { 00134 return (__m128d)((__v4si)__a & (__v4si)__b); 00135 } 00136 00137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00138 _mm_andnot_pd(__m128d __a, __m128d __b) 00139 { 00140 return (__m128d)(~(__v4si)__a & (__v4si)__b); 00141 } 00142 00143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00144 _mm_or_pd(__m128d __a, __m128d __b) 00145 { 00146 return (__m128d)((__v4si)__a | (__v4si)__b); 00147 } 00148 00149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00150 _mm_xor_pd(__m128d __a, __m128d __b) 00151 { 00152 return (__m128d)((__v4si)__a ^ (__v4si)__b); 00153 } 00154 00155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00156 _mm_cmpeq_pd(__m128d __a, __m128d __b) 00157 { 00158 return (__m128d)__builtin_ia32_cmppd(__a, __b, 0); 00159 } 00160 00161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00162 _mm_cmplt_pd(__m128d __a, __m128d __b) 00163 { 00164 return (__m128d)__builtin_ia32_cmppd(__a, __b, 1); 00165 } 00166 00167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00168 _mm_cmple_pd(__m128d __a, __m128d __b) 00169 { 00170 return (__m128d)__builtin_ia32_cmppd(__a, __b, 2); 00171 } 00172 00173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00174 _mm_cmpgt_pd(__m128d __a, __m128d __b) 00175 { 00176 return (__m128d)__builtin_ia32_cmppd(__b, __a, 1); 00177 } 00178 00179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00180 _mm_cmpge_pd(__m128d __a, __m128d __b) 00181 { 00182 return (__m128d)__builtin_ia32_cmppd(__b, __a, 2); 00183 } 00184 00185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00186 _mm_cmpord_pd(__m128d __a, __m128d __b) 00187 { 00188 return (__m128d)__builtin_ia32_cmppd(__a, __b, 7); 00189 } 00190 00191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00192 _mm_cmpunord_pd(__m128d __a, __m128d __b) 00193 { 00194 return (__m128d)__builtin_ia32_cmppd(__a, __b, 3); 00195 } 00196 00197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00198 _mm_cmpneq_pd(__m128d __a, __m128d __b) 00199 { 00200 return (__m128d)__builtin_ia32_cmppd(__a, __b, 4); 00201 } 00202 00203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00204 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 00205 { 00206 return (__m128d)__builtin_ia32_cmppd(__a, __b, 5); 00207 } 00208 00209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00210 _mm_cmpnle_pd(__m128d __a, __m128d __b) 00211 { 00212 return (__m128d)__builtin_ia32_cmppd(__a, __b, 6); 00213 } 00214 00215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00216 _mm_cmpngt_pd(__m128d __a, __m128d __b) 00217 { 00218 return (__m128d)__builtin_ia32_cmppd(__b, __a, 5); 00219 } 00220 00221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00222 _mm_cmpnge_pd(__m128d __a, __m128d __b) 00223 { 00224 return (__m128d)__builtin_ia32_cmppd(__b, __a, 6); 00225 } 00226 00227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00228 _mm_cmpeq_sd(__m128d __a, __m128d __b) 00229 { 00230 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0); 00231 } 00232 00233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00234 _mm_cmplt_sd(__m128d __a, __m128d __b) 00235 { 00236 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1); 00237 } 00238 00239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00240 _mm_cmple_sd(__m128d __a, __m128d __b) 00241 { 00242 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2); 00243 } 00244 00245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00246 _mm_cmpgt_sd(__m128d __a, __m128d __b) 00247 { 00248 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1); 00249 return (__m128d) { __c[0], __a[1] }; 00250 } 00251 00252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00253 _mm_cmpge_sd(__m128d __a, __m128d __b) 00254 { 00255 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2); 00256 return (__m128d) { __c[0], __a[1] }; 00257 } 00258 00259 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00260 _mm_cmpord_sd(__m128d __a, __m128d __b) 00261 { 00262 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7); 00263 } 00264 00265 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00266 _mm_cmpunord_sd(__m128d __a, __m128d __b) 00267 { 00268 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3); 00269 } 00270 00271 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00272 _mm_cmpneq_sd(__m128d __a, __m128d __b) 00273 { 00274 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4); 00275 } 00276 00277 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00278 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 00279 { 00280 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5); 00281 } 00282 00283 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00284 _mm_cmpnle_sd(__m128d __a, __m128d __b) 00285 { 00286 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6); 00287 } 00288 00289 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00290 _mm_cmpngt_sd(__m128d __a, __m128d __b) 00291 { 00292 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5); 00293 return (__m128d) { __c[0], __a[1] }; 00294 } 00295 00296 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00297 _mm_cmpnge_sd(__m128d __a, __m128d __b) 00298 { 00299 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6); 00300 return (__m128d) { __c[0], __a[1] }; 00301 } 00302 00303 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00304 _mm_comieq_sd(__m128d __a, __m128d __b) 00305 { 00306 return __builtin_ia32_comisdeq(__a, __b); 00307 } 00308 00309 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00310 _mm_comilt_sd(__m128d __a, __m128d __b) 00311 { 00312 return __builtin_ia32_comisdlt(__a, __b); 00313 } 00314 00315 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00316 _mm_comile_sd(__m128d __a, __m128d __b) 00317 { 00318 return __builtin_ia32_comisdle(__a, __b); 00319 } 00320 00321 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00322 _mm_comigt_sd(__m128d __a, __m128d __b) 00323 { 00324 return __builtin_ia32_comisdgt(__a, __b); 00325 } 00326 00327 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00328 _mm_comige_sd(__m128d __a, __m128d __b) 00329 { 00330 return __builtin_ia32_comisdge(__a, __b); 00331 } 00332 00333 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00334 _mm_comineq_sd(__m128d __a, __m128d __b) 00335 { 00336 return __builtin_ia32_comisdneq(__a, __b); 00337 } 00338 00339 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00340 _mm_ucomieq_sd(__m128d __a, __m128d __b) 00341 { 00342 return __builtin_ia32_ucomisdeq(__a, __b); 00343 } 00344 00345 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00346 _mm_ucomilt_sd(__m128d __a, __m128d __b) 00347 { 00348 return __builtin_ia32_ucomisdlt(__a, __b); 00349 } 00350 00351 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00352 _mm_ucomile_sd(__m128d __a, __m128d __b) 00353 { 00354 return __builtin_ia32_ucomisdle(__a, __b); 00355 } 00356 00357 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00358 _mm_ucomigt_sd(__m128d __a, __m128d __b) 00359 { 00360 return __builtin_ia32_ucomisdgt(__a, __b); 00361 } 00362 00363 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00364 _mm_ucomige_sd(__m128d __a, __m128d __b) 00365 { 00366 return __builtin_ia32_ucomisdge(__a, __b); 00367 } 00368 00369 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00370 _mm_ucomineq_sd(__m128d __a, __m128d __b) 00371 { 00372 return __builtin_ia32_ucomisdneq(__a, __b); 00373 } 00374 00375 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 00376 _mm_cvtpd_ps(__m128d __a) 00377 { 00378 return __builtin_ia32_cvtpd2ps(__a); 00379 } 00380 00381 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00382 _mm_cvtps_pd(__m128 __a) 00383 { 00384 return __builtin_ia32_cvtps2pd(__a); 00385 } 00386 00387 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00388 _mm_cvtepi32_pd(__m128i __a) 00389 { 00390 return __builtin_ia32_cvtdq2pd((__v4si)__a); 00391 } 00392 00393 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00394 _mm_cvtpd_epi32(__m128d __a) 00395 { 00396 return __builtin_ia32_cvtpd2dq(__a); 00397 } 00398 00399 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00400 _mm_cvtsd_si32(__m128d __a) 00401 { 00402 return __builtin_ia32_cvtsd2si(__a); 00403 } 00404 00405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 00406 _mm_cvtsd_ss(__m128 __a, __m128d __b) 00407 { 00408 __a[0] = __b[0]; 00409 return __a; 00410 } 00411 00412 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00413 _mm_cvtsi32_sd(__m128d __a, int __b) 00414 { 00415 __a[0] = __b; 00416 return __a; 00417 } 00418 00419 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00420 _mm_cvtss_sd(__m128d __a, __m128 __b) 00421 { 00422 __a[0] = __b[0]; 00423 return __a; 00424 } 00425 00426 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00427 _mm_cvttpd_epi32(__m128d __a) 00428 { 00429 return (__m128i)__builtin_ia32_cvttpd2dq(__a); 00430 } 00431 00432 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00433 _mm_cvttsd_si32(__m128d __a) 00434 { 00435 return __a[0]; 00436 } 00437 00438 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00439 _mm_cvtpd_pi32(__m128d __a) 00440 { 00441 return (__m64)__builtin_ia32_cvtpd2pi(__a); 00442 } 00443 00444 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00445 _mm_cvttpd_pi32(__m128d __a) 00446 { 00447 return (__m64)__builtin_ia32_cvttpd2pi(__a); 00448 } 00449 00450 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00451 _mm_cvtpi32_pd(__m64 __a) 00452 { 00453 return __builtin_ia32_cvtpi2pd((__v2si)__a); 00454 } 00455 00456 static __inline__ double __attribute__((__always_inline__, __nodebug__)) 00457 _mm_cvtsd_f64(__m128d __a) 00458 { 00459 return __a[0]; 00460 } 00461 00462 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00463 _mm_load_pd(double const *__dp) 00464 { 00465 return *(__m128d*)__dp; 00466 } 00467 00468 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00469 _mm_load1_pd(double const *__dp) 00470 { 00471 struct __mm_load1_pd_struct { 00472 double __u; 00473 } __attribute__((__packed__, __may_alias__)); 00474 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 00475 return (__m128d){ __u, __u }; 00476 } 00477 00478 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 00479 00480 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00481 _mm_loadr_pd(double const *__dp) 00482 { 00483 __m128d __u = *(__m128d*)__dp; 00484 return __builtin_shufflevector(__u, __u, 1, 0); 00485 } 00486 00487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00488 _mm_loadu_pd(double const *__dp) 00489 { 00490 struct __loadu_pd { 00491 __m128d __v; 00492 } __attribute__((packed, may_alias)); 00493 return ((struct __loadu_pd*)__dp)->__v; 00494 } 00495 00496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00497 _mm_load_sd(double const *__dp) 00498 { 00499 struct __mm_load_sd_struct { 00500 double __u; 00501 } __attribute__((__packed__, __may_alias__)); 00502 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 00503 return (__m128d){ __u, 0 }; 00504 } 00505 00506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00507 _mm_loadh_pd(__m128d __a, double const *__dp) 00508 { 00509 struct __mm_loadh_pd_struct { 00510 double __u; 00511 } __attribute__((__packed__, __may_alias__)); 00512 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 00513 return (__m128d){ __a[0], __u }; 00514 } 00515 00516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00517 _mm_loadl_pd(__m128d __a, double const *__dp) 00518 { 00519 struct __mm_loadl_pd_struct { 00520 double __u; 00521 } __attribute__((__packed__, __may_alias__)); 00522 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 00523 return (__m128d){ __u, __a[1] }; 00524 } 00525 00526 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00527 _mm_set_sd(double __w) 00528 { 00529 return (__m128d){ __w, 0 }; 00530 } 00531 00532 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00533 _mm_set1_pd(double __w) 00534 { 00535 return (__m128d){ __w, __w }; 00536 } 00537 00538 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00539 _mm_set_pd(double __w, double __x) 00540 { 00541 return (__m128d){ __x, __w }; 00542 } 00543 00544 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00545 _mm_setr_pd(double __w, double __x) 00546 { 00547 return (__m128d){ __w, __x }; 00548 } 00549 00550 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00551 _mm_setzero_pd(void) 00552 { 00553 return (__m128d){ 0, 0 }; 00554 } 00555 00556 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00557 _mm_move_sd(__m128d __a, __m128d __b) 00558 { 00559 return (__m128d){ __b[0], __a[1] }; 00560 } 00561 00562 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00563 _mm_store_sd(double *__dp, __m128d __a) 00564 { 00565 struct __mm_store_sd_struct { 00566 double __u; 00567 } __attribute__((__packed__, __may_alias__)); 00568 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 00569 } 00570 00571 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00572 _mm_store1_pd(double *__dp, __m128d __a) 00573 { 00574 struct __mm_store1_pd_struct { 00575 double __u[2]; 00576 } __attribute__((__packed__, __may_alias__)); 00577 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0]; 00578 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; 00579 } 00580 00581 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00582 _mm_store_pd(double *__dp, __m128d __a) 00583 { 00584 *(__m128d *)__dp = __a; 00585 } 00586 00587 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00588 _mm_storeu_pd(double *__dp, __m128d __a) 00589 { 00590 __builtin_ia32_storeupd(__dp, __a); 00591 } 00592 00593 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00594 _mm_storer_pd(double *__dp, __m128d __a) 00595 { 00596 __a = __builtin_shufflevector(__a, __a, 1, 0); 00597 *(__m128d *)__dp = __a; 00598 } 00599 00600 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00601 _mm_storeh_pd(double *__dp, __m128d __a) 00602 { 00603 struct __mm_storeh_pd_struct { 00604 double __u; 00605 } __attribute__((__packed__, __may_alias__)); 00606 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 00607 } 00608 00609 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00610 _mm_storel_pd(double *__dp, __m128d __a) 00611 { 00612 struct __mm_storeh_pd_struct { 00613 double __u; 00614 } __attribute__((__packed__, __may_alias__)); 00615 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 00616 } 00617 00618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00619 _mm_add_epi8(__m128i __a, __m128i __b) 00620 { 00621 return (__m128i)((__v16qi)__a + (__v16qi)__b); 00622 } 00623 00624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00625 _mm_add_epi16(__m128i __a, __m128i __b) 00626 { 00627 return (__m128i)((__v8hi)__a + (__v8hi)__b); 00628 } 00629 00630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00631 _mm_add_epi32(__m128i __a, __m128i __b) 00632 { 00633 return (__m128i)((__v4si)__a + (__v4si)__b); 00634 } 00635 00636 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00637 _mm_add_si64(__m64 __a, __m64 __b) 00638 { 00639 return __a + __b; 00640 } 00641 00642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00643 _mm_add_epi64(__m128i __a, __m128i __b) 00644 { 00645 return __a + __b; 00646 } 00647 00648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00649 _mm_adds_epi8(__m128i __a, __m128i __b) 00650 { 00651 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 00652 } 00653 00654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00655 _mm_adds_epi16(__m128i __a, __m128i __b) 00656 { 00657 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 00658 } 00659 00660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00661 _mm_adds_epu8(__m128i __a, __m128i __b) 00662 { 00663 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 00664 } 00665 00666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00667 _mm_adds_epu16(__m128i __a, __m128i __b) 00668 { 00669 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 00670 } 00671 00672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00673 _mm_avg_epu8(__m128i __a, __m128i __b) 00674 { 00675 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 00676 } 00677 00678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00679 _mm_avg_epu16(__m128i __a, __m128i __b) 00680 { 00681 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 00682 } 00683 00684 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00685 _mm_madd_epi16(__m128i __a, __m128i __b) 00686 { 00687 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 00688 } 00689 00690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00691 _mm_max_epi16(__m128i __a, __m128i __b) 00692 { 00693 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 00694 } 00695 00696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00697 _mm_max_epu8(__m128i __a, __m128i __b) 00698 { 00699 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 00700 } 00701 00702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00703 _mm_min_epi16(__m128i __a, __m128i __b) 00704 { 00705 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 00706 } 00707 00708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00709 _mm_min_epu8(__m128i __a, __m128i __b) 00710 { 00711 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 00712 } 00713 00714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00715 _mm_mulhi_epi16(__m128i __a, __m128i __b) 00716 { 00717 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 00718 } 00719 00720 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00721 _mm_mulhi_epu16(__m128i __a, __m128i __b) 00722 { 00723 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 00724 } 00725 00726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00727 _mm_mullo_epi16(__m128i __a, __m128i __b) 00728 { 00729 return (__m128i)((__v8hi)__a * (__v8hi)__b); 00730 } 00731 00732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00733 _mm_mul_su32(__m64 __a, __m64 __b) 00734 { 00735 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 00736 } 00737 00738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00739 _mm_mul_epu32(__m128i __a, __m128i __b) 00740 { 00741 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 00742 } 00743 00744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00745 _mm_sad_epu8(__m128i __a, __m128i __b) 00746 { 00747 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 00748 } 00749 00750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00751 _mm_sub_epi8(__m128i __a, __m128i __b) 00752 { 00753 return (__m128i)((__v16qi)__a - (__v16qi)__b); 00754 } 00755 00756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00757 _mm_sub_epi16(__m128i __a, __m128i __b) 00758 { 00759 return (__m128i)((__v8hi)__a - (__v8hi)__b); 00760 } 00761 00762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00763 _mm_sub_epi32(__m128i __a, __m128i __b) 00764 { 00765 return (__m128i)((__v4si)__a - (__v4si)__b); 00766 } 00767 00768 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00769 _mm_sub_si64(__m64 __a, __m64 __b) 00770 { 00771 return __a - __b; 00772 } 00773 00774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00775 _mm_sub_epi64(__m128i __a, __m128i __b) 00776 { 00777 return __a - __b; 00778 } 00779 00780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00781 _mm_subs_epi8(__m128i __a, __m128i __b) 00782 { 00783 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 00784 } 00785 00786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00787 _mm_subs_epi16(__m128i __a, __m128i __b) 00788 { 00789 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 00790 } 00791 00792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00793 _mm_subs_epu8(__m128i __a, __m128i __b) 00794 { 00795 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 00796 } 00797 00798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00799 _mm_subs_epu16(__m128i __a, __m128i __b) 00800 { 00801 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 00802 } 00803 00804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00805 _mm_and_si128(__m128i __a, __m128i __b) 00806 { 00807 return __a & __b; 00808 } 00809 00810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00811 _mm_andnot_si128(__m128i __a, __m128i __b) 00812 { 00813 return ~__a & __b; 00814 } 00815 00816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00817 _mm_or_si128(__m128i __a, __m128i __b) 00818 { 00819 return __a | __b; 00820 } 00821 00822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00823 _mm_xor_si128(__m128i __a, __m128i __b) 00824 { 00825 return __a ^ __b; 00826 } 00827 00828 #define _mm_slli_si128(a, count) __extension__ ({ \ 00829 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 00830 __m128i __a = (a); \ 00831 _Pragma("clang diagnostic pop"); \ 00832 (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); }) 00833 00834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00835 _mm_slli_epi16(__m128i __a, int __count) 00836 { 00837 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 00838 } 00839 00840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00841 _mm_sll_epi16(__m128i __a, __m128i __count) 00842 { 00843 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 00844 } 00845 00846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00847 _mm_slli_epi32(__m128i __a, int __count) 00848 { 00849 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 00850 } 00851 00852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00853 _mm_sll_epi32(__m128i __a, __m128i __count) 00854 { 00855 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 00856 } 00857 00858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00859 _mm_slli_epi64(__m128i __a, int __count) 00860 { 00861 return __builtin_ia32_psllqi128(__a, __count); 00862 } 00863 00864 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00865 _mm_sll_epi64(__m128i __a, __m128i __count) 00866 { 00867 return __builtin_ia32_psllq128(__a, __count); 00868 } 00869 00870 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00871 _mm_srai_epi16(__m128i __a, int __count) 00872 { 00873 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 00874 } 00875 00876 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00877 _mm_sra_epi16(__m128i __a, __m128i __count) 00878 { 00879 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 00880 } 00881 00882 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00883 _mm_srai_epi32(__m128i __a, int __count) 00884 { 00885 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 00886 } 00887 00888 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00889 _mm_sra_epi32(__m128i __a, __m128i __count) 00890 { 00891 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 00892 } 00893 00894 00895 #define _mm_srli_si128(a, count) __extension__ ({ \ 00896 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 00897 __m128i __a = (a); \ 00898 _Pragma("clang diagnostic pop"); \ 00899 (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); }) 00900 00901 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00902 _mm_srli_epi16(__m128i __a, int __count) 00903 { 00904 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 00905 } 00906 00907 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00908 _mm_srl_epi16(__m128i __a, __m128i __count) 00909 { 00910 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 00911 } 00912 00913 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00914 _mm_srli_epi32(__m128i __a, int __count) 00915 { 00916 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 00917 } 00918 00919 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00920 _mm_srl_epi32(__m128i __a, __m128i __count) 00921 { 00922 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 00923 } 00924 00925 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00926 _mm_srli_epi64(__m128i __a, int __count) 00927 { 00928 return __builtin_ia32_psrlqi128(__a, __count); 00929 } 00930 00931 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00932 _mm_srl_epi64(__m128i __a, __m128i __count) 00933 { 00934 return __builtin_ia32_psrlq128(__a, __count); 00935 } 00936 00937 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00938 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 00939 { 00940 return (__m128i)((__v16qi)__a == (__v16qi)__b); 00941 } 00942 00943 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00944 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 00945 { 00946 return (__m128i)((__v8hi)__a == (__v8hi)__b); 00947 } 00948 00949 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00950 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 00951 { 00952 return (__m128i)((__v4si)__a == (__v4si)__b); 00953 } 00954 00955 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00956 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 00957 { 00958 /* This function always performs a signed comparison, but __v16qi is a char 00959 which may be signed or unsigned. */ 00960 typedef signed char __v16qs __attribute__((__vector_size__(16))); 00961 return (__m128i)((__v16qs)__a > (__v16qs)__b); 00962 } 00963 00964 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00965 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 00966 { 00967 return (__m128i)((__v8hi)__a > (__v8hi)__b); 00968 } 00969 00970 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00971 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 00972 { 00973 return (__m128i)((__v4si)__a > (__v4si)__b); 00974 } 00975 00976 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00977 _mm_cmplt_epi8(__m128i __a, __m128i __b) 00978 { 00979 return _mm_cmpgt_epi8(__b, __a); 00980 } 00981 00982 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00983 _mm_cmplt_epi16(__m128i __a, __m128i __b) 00984 { 00985 return _mm_cmpgt_epi16(__b, __a); 00986 } 00987 00988 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00989 _mm_cmplt_epi32(__m128i __a, __m128i __b) 00990 { 00991 return _mm_cmpgt_epi32(__b, __a); 00992 } 00993 00994 #ifdef __x86_64__ 00995 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00996 _mm_cvtsi64_sd(__m128d __a, long long __b) 00997 { 00998 __a[0] = __b; 00999 return __a; 01000 } 01001 01002 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 01003 _mm_cvtsd_si64(__m128d __a) 01004 { 01005 return __builtin_ia32_cvtsd2si64(__a); 01006 } 01007 01008 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 01009 _mm_cvttsd_si64(__m128d __a) 01010 { 01011 return __a[0]; 01012 } 01013 #endif 01014 01015 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01016 _mm_cvtepi32_ps(__m128i __a) 01017 { 01018 return __builtin_ia32_cvtdq2ps((__v4si)__a); 01019 } 01020 01021 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01022 _mm_cvtps_epi32(__m128 __a) 01023 { 01024 return (__m128i)__builtin_ia32_cvtps2dq(__a); 01025 } 01026 01027 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01028 _mm_cvttps_epi32(__m128 __a) 01029 { 01030 return (__m128i)__builtin_ia32_cvttps2dq(__a); 01031 } 01032 01033 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01034 _mm_cvtsi32_si128(int __a) 01035 { 01036 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 01037 } 01038 01039 #ifdef __x86_64__ 01040 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01041 _mm_cvtsi64_si128(long long __a) 01042 { 01043 return (__m128i){ __a, 0 }; 01044 } 01045 #endif 01046 01047 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01048 _mm_cvtsi128_si32(__m128i __a) 01049 { 01050 __v4si __b = (__v4si)__a; 01051 return __b[0]; 01052 } 01053 01054 #ifdef __x86_64__ 01055 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 01056 _mm_cvtsi128_si64(__m128i __a) 01057 { 01058 return __a[0]; 01059 } 01060 #endif 01061 01062 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01063 _mm_load_si128(__m128i const *__p) 01064 { 01065 return *__p; 01066 } 01067 01068 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01069 _mm_loadu_si128(__m128i const *__p) 01070 { 01071 struct __loadu_si128 { 01072 __m128i __v; 01073 } __attribute__((packed, may_alias)); 01074 return ((struct __loadu_si128*)__p)->__v; 01075 } 01076 01077 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01078 _mm_loadl_epi64(__m128i const *__p) 01079 { 01080 struct __mm_loadl_epi64_struct { 01081 long long __u; 01082 } __attribute__((__packed__, __may_alias__)); 01083 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 01084 } 01085 01086 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01087 _mm_set_epi64x(long long q1, long long q0) 01088 { 01089 return (__m128i){ q0, q1 }; 01090 } 01091 01092 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01093 _mm_set_epi64(__m64 q1, __m64 q0) 01094 { 01095 return (__m128i){ (long long)q0, (long long)q1 }; 01096 } 01097 01098 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01099 _mm_set_epi32(int i3, int i2, int i1, int i0) 01100 { 01101 return (__m128i)(__v4si){ i0, i1, i2, i3}; 01102 } 01103 01104 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01105 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 01106 { 01107 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 01108 } 01109 01110 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01111 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 01112 { 01113 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 01114 } 01115 01116 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01117 _mm_set1_epi64x(long long __q) 01118 { 01119 return (__m128i){ __q, __q }; 01120 } 01121 01122 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01123 _mm_set1_epi64(__m64 __q) 01124 { 01125 return (__m128i){ (long long)__q, (long long)__q }; 01126 } 01127 01128 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01129 _mm_set1_epi32(int __i) 01130 { 01131 return (__m128i)(__v4si){ __i, __i, __i, __i }; 01132 } 01133 01134 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01135 _mm_set1_epi16(short __w) 01136 { 01137 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 01138 } 01139 01140 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01141 _mm_set1_epi8(char __b) 01142 { 01143 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 01144 } 01145 01146 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01147 _mm_setr_epi64(__m64 q0, __m64 q1) 01148 { 01149 return (__m128i){ (long long)q0, (long long)q1 }; 01150 } 01151 01152 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01153 _mm_setr_epi32(int i0, int i1, int i2, int i3) 01154 { 01155 return (__m128i)(__v4si){ i0, i1, i2, i3}; 01156 } 01157 01158 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01159 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 01160 { 01161 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 01162 } 01163 01164 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01165 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 01166 { 01167 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 01168 } 01169 01170 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01171 _mm_setzero_si128(void) 01172 { 01173 return (__m128i){ 0LL, 0LL }; 01174 } 01175 01176 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01177 _mm_store_si128(__m128i *__p, __m128i __b) 01178 { 01179 *__p = __b; 01180 } 01181 01182 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01183 _mm_storeu_si128(__m128i *__p, __m128i __b) 01184 { 01185 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 01186 } 01187 01188 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01189 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 01190 { 01191 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 01192 } 01193 01194 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01195 _mm_storel_epi64(__m128i *__p, __m128i __a) 01196 { 01197 struct __mm_storel_epi64_struct { 01198 long long __u; 01199 } __attribute__((__packed__, __may_alias__)); 01200 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 01201 } 01202 01203 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01204 _mm_stream_pd(double *__p, __m128d __a) 01205 { 01206 __builtin_ia32_movntpd(__p, __a); 01207 } 01208 01209 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01210 _mm_stream_si128(__m128i *__p, __m128i __a) 01211 { 01212 __builtin_ia32_movntdq(__p, __a); 01213 } 01214 01215 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01216 _mm_stream_si32(int *__p, int __a) 01217 { 01218 __builtin_ia32_movnti(__p, __a); 01219 } 01220 01221 #ifdef __x86_64__ 01222 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01223 _mm_stream_si64(long long *__p, long long __a) 01224 { 01225 __builtin_ia32_movnti64(__p, __a); 01226 } 01227 #endif 01228 01229 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01230 _mm_clflush(void const *__p) 01231 { 01232 __builtin_ia32_clflush(__p); 01233 } 01234 01235 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01236 _mm_lfence(void) 01237 { 01238 __builtin_ia32_lfence(); 01239 } 01240 01241 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01242 _mm_mfence(void) 01243 { 01244 __builtin_ia32_mfence(); 01245 } 01246 01247 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01248 _mm_packs_epi16(__m128i __a, __m128i __b) 01249 { 01250 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 01251 } 01252 01253 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01254 _mm_packs_epi32(__m128i __a, __m128i __b) 01255 { 01256 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 01257 } 01258 01259 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01260 _mm_packus_epi16(__m128i __a, __m128i __b) 01261 { 01262 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 01263 } 01264 01265 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01266 _mm_extract_epi16(__m128i __a, int __imm) 01267 { 01268 __v8hi __b = (__v8hi)__a; 01269 return (unsigned short)__b[__imm & 7]; 01270 } 01271 01272 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01273 _mm_insert_epi16(__m128i __a, int __b, int __imm) 01274 { 01275 __v8hi __c = (__v8hi)__a; 01276 __c[__imm & 7] = __b; 01277 return (__m128i)__c; 01278 } 01279 01280 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01281 _mm_movemask_epi8(__m128i __a) 01282 { 01283 return __builtin_ia32_pmovmskb128((__v16qi)__a); 01284 } 01285 01286 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 01287 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 01288 __m128i __a = (a); \ 01289 _Pragma("clang diagnostic pop"); \ 01290 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ 01291 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 01292 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 01293 01294 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 01295 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 01296 __m128i __a = (a); \ 01297 _Pragma("clang diagnostic pop"); \ 01298 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 01299 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 01300 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 01301 4, 5, 6, 7); }) 01302 01303 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 01304 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 01305 __m128i __a = (a); \ 01306 _Pragma("clang diagnostic pop"); \ 01307 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 01308 0, 1, 2, 3, \ 01309 4 + (((imm) & 0x03) >> 0), \ 01310 4 + (((imm) & 0x0c) >> 2), \ 01311 4 + (((imm) & 0x30) >> 4), \ 01312 4 + (((imm) & 0xc0) >> 6)); }) 01313 01314 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01315 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 01316 { 01317 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 01318 } 01319 01320 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01321 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 01322 { 01323 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 01324 } 01325 01326 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01327 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 01328 { 01329 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 01330 } 01331 01332 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01333 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 01334 { 01335 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 01336 } 01337 01338 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01339 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 01340 { 01341 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 01342 } 01343 01344 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01345 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 01346 { 01347 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 01348 } 01349 01350 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01351 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 01352 { 01353 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 01354 } 01355 01356 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01357 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 01358 { 01359 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 01360 } 01361 01362 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 01363 _mm_movepi64_pi64(__m128i __a) 01364 { 01365 return (__m64)__a[0]; 01366 } 01367 01368 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01369 _mm_movpi64_epi64(__m64 __a) 01370 { 01371 return (__m128i){ (long long)__a, 0 }; 01372 } 01373 01374 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01375 _mm_move_epi64(__m128i __a) 01376 { 01377 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 01378 } 01379 01380 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01381 _mm_unpackhi_pd(__m128d __a, __m128d __b) 01382 { 01383 return __builtin_shufflevector(__a, __b, 1, 2+1); 01384 } 01385 01386 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01387 _mm_unpacklo_pd(__m128d __a, __m128d __b) 01388 { 01389 return __builtin_shufflevector(__a, __b, 0, 2+0); 01390 } 01391 01392 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01393 _mm_movemask_pd(__m128d __a) 01394 { 01395 return __builtin_ia32_movmskpd(__a); 01396 } 01397 01398 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 01399 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ 01400 __m128d __a = (a); \ 01401 __m128d __b = (b); \ 01402 _Pragma("clang diagnostic pop"); \ 01403 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) 01404 01405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01406 _mm_castpd_ps(__m128d __a) 01407 { 01408 return (__m128)__a; 01409 } 01410 01411 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01412 _mm_castpd_si128(__m128d __a) 01413 { 01414 return (__m128i)__a; 01415 } 01416 01417 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01418 _mm_castps_pd(__m128 __a) 01419 { 01420 return (__m128d)__a; 01421 } 01422 01423 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01424 _mm_castps_si128(__m128 __a) 01425 { 01426 return (__m128i)__a; 01427 } 01428 01429 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01430 _mm_castsi128_ps(__m128i __a) 01431 { 01432 return (__m128)__a; 01433 } 01434 01435 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01436 _mm_castsi128_pd(__m128i __a) 01437 { 01438 return (__m128d)__a; 01439 } 01440 01441 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01442 _mm_pause(void) 01443 { 01444 __asm__ volatile ("pause"); 01445 } 01446 01447 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 01448 01449 #endif /* __SSE2__ */ 01450 01451 #endif /* __EMMINTRIN_H */