clang API Documentation
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
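In the hosted branch above, <mm_malloc.h> supplies _mm_malloc and _mm_free for allocations with the 16-byte alignment that __m128 loads and stores expect. A minimal illustrative sketch, not part of this header (the helper name alloc_aligned_floats is made up for the example):

/* Illustrative only: allocate storage aligned for __m128 loads/stores. */
#include <stddef.h>
#include <xmmintrin.h>

static float *alloc_aligned_floats(size_t n)
{
  /* 16-byte alignment matches the requirement of _mm_load_ps/_mm_store_ps. */
  float *p = (float *)_mm_malloc(n * sizeof(float), 16);
  return p; /* may be NULL on failure; caller releases with _mm_free(p) */
}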
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 __a, __m128 __b)
{
  __a[0] += __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 __a, __m128 __b)
{
  return __a + __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 __a, __m128 __b)
{
  __a[0] -= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 __a, __m128 __b)
{
  return __a - __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 __a, __m128 __b)
{
  __a[0] *= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 __a, __m128 __b)
{
  return __a * __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 __a, __m128 __b)
{
  __a[0] /= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 __a, __m128 __b)
{
  return __a / __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_sqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 __a)
{
  return __builtin_ia32_sqrtps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rcpss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 __a)
{
  return __builtin_ia32_rcpps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rsqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 __a)
{
  return __builtin_ia32_rsqrtps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxps(__a, __b);
}
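As a brief illustration of the packed arithmetic and min/max intrinsics above (not part of the header; the function name clamped_axpb is invented for the example), the snippet below evaluates a clamped a*x + b four floats at a time:

#include <xmmintrin.h>

/* Compute min(max(a*x + b, lo), hi) on four floats at once. */
static __m128 clamped_axpb(__m128 x, __m128 a, __m128 b, __m128 lo, __m128 hi)
{
  __m128 y = _mm_add_ps(_mm_mul_ps(a, x), b); /* a*x + b, elementwise */
  y = _mm_max_ps(y, lo);                      /* clamp from below     */
  return _mm_min_ps(y, hi);                   /* clamp from above     */
}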
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  return (__m128)(~(__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a | (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a ^ (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 1),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 2),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 5),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 6),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
}
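The cmp*_ps intrinsics above produce all-ones or all-zero lanes, so they combine with _mm_and_ps, _mm_andnot_ps, and _mm_or_ps into a branchless per-lane select. An illustrative sketch, not part of the header (the helper name select_lt is assumed):

#include <xmmintrin.h>

/* Per-lane select: where a < b take x, elsewhere take y. */
static __m128 select_lt(__m128 a, __m128 b, __m128 x, __m128 y)
{
  __m128 mask = _mm_cmplt_ps(a, b);         /* all-ones lanes where a < b  */
  return _mm_or_ps(_mm_and_ps(mask, x),     /* keep x where mask is set    */
                   _mm_andnot_ps(mask, y)); /* keep y where mask is clear  */
}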
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comieq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comineq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomieq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomineq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 __a)
{
  return __builtin_ia32_cvtss2si(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 __a)
{
  return _mm_cvtss_si32(__a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64(__a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 __a)
{
  return __a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 __a)
{
  return _mm_cvttss_si32(__a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 __a)
{
  return __a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 __a, int __b)
{
  return _mm_cvtsi32_ss(__a, __b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
}
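Note the difference between _mm_cvtss_si32, which converts using the current MXCSR rounding mode, and _mm_cvttss_si32, which truncates toward zero. A small sketch, not part of the header, assuming the default round-to-nearest mode (the function name convert_demo is invented):

#include <xmmintrin.h>

static void convert_demo(void)
{
  __m128 v = _mm_set_ss(2.7f);          /* _mm_set_ss is defined below */
  int rounded   = _mm_cvtss_si32(v);    /* 3 under round-to-nearest    */
  int truncated = _mm_cvttss_si32(v);   /* always 2: truncation        */
  (void)rounded; (void)truncated;
}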
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  return _mm_cvtpi32_ps(__a, __b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 __a)
{
  return __a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *__p)
{
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
  return (__m128){ __u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *__p)
{
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
  return (__m128){ __u, __u, __u, __u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *__p)
{
  return *(__m128*)__p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *__p)
{
  struct __loadu_ps {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *__p)
{
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float __w)
{
  return (__m128){ __w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float __w)
{
  return (__m128){ __w, __w, __w, __w };
}

/* Microsoft specific. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float __w)
{
  return _mm_set1_ps(__w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __w, __x, __y, __z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __z, __y, __x, __w };
}

static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storehps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storelps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *__p, __m128 __a)
{
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *__p, __m128 __a)
{
  __builtin_ia32_storeups(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
  _mm_storeu_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *__p, __m128 __a)
{
  return _mm_store1_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *__p, __m128 __a)
{
  *(__m128 *)__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
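A short illustration of the load/store pairs above, not part of the header (the helper name copy4 is assumed): _mm_load_ps/_mm_store_ps require 16-byte-aligned pointers, while the unaligned variants accept any address.

#include <xmmintrin.h>

/* Copy four floats through an __m128 register. */
static void copy4(float *dst, const float *src)
{
  __m128 v = _mm_loadu_ps(src);  /* unaligned load: any pointer is fine */
  _mm_storeu_ps(dst, v);         /* unaligned store                     */
  /* _mm_load_ps/_mm_store_ps would need 16-byte-aligned src/dst here.  */
}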
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *__p, __m64 __a)
{
  __builtin_ia32_movntq(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *__p, __m128 __a)
{
  __builtin_ia32_movntps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 __a, int __n)
{
  __v4hi __b = (__v4hi)__a;
  return (unsigned short)__b[__n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 __a, int __d, int __n)
{
  __v4hi __b = (__v4hi)__a;
  __b[__n & 3] = __d;
  return (__m64)__b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}
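The non-temporal store _mm_stream_ps above bypasses the cache, and _mm_sfence orders those stores with respect to later ones. A hedged sketch of the usual fill-then-fence pattern, not part of the header (the helper name fill_streaming is invented; the buffer is assumed 16-byte aligned with a length that is a multiple of 4):

#include <xmmintrin.h>

static void fill_streaming(float *dst, float value, int n)
{
  __m128 v = _mm_set1_ps(value);
  int i;
  for (i = 0; i < n; i += 4)
    _mm_stream_ps(dst + i, v);  /* non-temporal store, bypasses the cache */
  _mm_sfence();                 /* make the streaming stores ordered      */
}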
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int __i)
{
  __builtin_ia32_ldmxcsr(__i);
}

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
}
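The _mm_shuffle_ps macro above selects two lanes from each operand via a 2-bit field per result lane (the _MM_SHUFFLE macro defined further below builds such immediates). For illustration only, not part of the header (the helper name broadcast_lane0 is assumed):

#include <xmmintrin.h>

static __m128 broadcast_lane0(__m128 v)
{
  /* Immediate 0x00 picks lane 0 of each operand for all four result lanes,
     i.e. _MM_SHUFFLE(0, 0, 0, 0), so every lane becomes v[0]. */
  return _mm_shuffle_ps(v, v, 0x00);
}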
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi16(__a);
  __c = _mm_setzero_si64();

  return _mm_packs_pi16(__b, __c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 __a)
{
  return __builtin_ia32_movmskps(__a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID (0x0001)
#define _MM_EXCEPT_DENORM (0x0002)
#define _MM_EXCEPT_DIV_ZERO (0x0004)
#define _MM_EXCEPT_OVERFLOW (0x0008)
#define _MM_EXCEPT_UNDERFLOW (0x0010)
#define _MM_EXCEPT_INEXACT (0x0020)
#define _MM_EXCEPT_MASK (0x003f)

#define _MM_MASK_INVALID (0x0080)
#define _MM_MASK_DENORM (0x0100)
#define _MM_MASK_DIV_ZERO (0x0200)
#define _MM_MASK_OVERFLOW (0x0400)
#define _MM_MASK_UNDERFLOW (0x0800)
#define _MM_MASK_INEXACT (0x1000)
#define _MM_MASK_MASK (0x1f80)

#define _MM_ROUND_NEAREST (0x0000)
#define _MM_ROUND_DOWN (0x2000)
#define _MM_ROUND_UP (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK (0x6000)

#define _MM_FLUSH_ZERO_MASK (0x8000)
#define _MM_FLUSH_ZERO_ON (0x8000)
#define _MM_FLUSH_ZERO_OFF (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */