clang API Documentation
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d __a, __m256d __b)
{
  return __a+__b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 __a, __m256 __b)
{
  return __a+__b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d __a, __m256d __b)
{
  return __a-__b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 __a, __m256 __b)
{
  return __a-__b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d __a, __m256d __b)
{
  return __a / __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 __a, __m256 __b)
{
  return __a / __b;
}
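/* Usage sketch (illustrative, not part of the original header): element-wise
 * arithmetic on 256-bit vectors. The helper name average_pd is hypothetical;
 * assumes the caller includes <immintrin.h> and compiles with AVX enabled
 * (e.g. -mavx).
 *
 *   #include <immintrin.h>
 *
 *   static inline void average_pd(double *out, const double *x, const double *y)
 *   {
 *     __m256d a = _mm256_loadu_pd(x);    // four doubles from x
 *     __m256d b = _mm256_loadu_pd(y);    // four doubles from y
 *     __m256d s = _mm256_add_pd(a, b);   // lane-wise sum
 *     _mm256_storeu_pd(out, _mm256_div_pd(s, _mm256_set1_pd(2.0)));
 *   }
 */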
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d __a, __m256d __b)
{
  return __a * __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 __a, __m256 __b)
{
  return __a * __b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d __a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a & (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a & (__v8si)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d __a, __m256d __b)
{
  return (__m256d)(~(__v4di)__a & (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 __a, __m256 __b)
{
  return (__m256)(~(__v8si)__a & (__v8si)__b);
}
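/* Usage sketch (illustrative, not part of the original header):
 * _mm256_andnot_pd computes ~a & b, so masking off each lane's sign bit
 * gives a vectorized fabs(). The helper name abs_pd is hypothetical;
 * assumes <immintrin.h> and AVX.
 *
 *   static inline __m256d abs_pd(__m256d __v)
 *   {
 *     const __m256d __sign = _mm256_set1_pd(-0.0);   // only the sign bit set
 *     return _mm256_andnot_pd(__sign, __v);          // ~sign & v clears it
 *   }
 */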
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a | (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a | (__v8si)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a ^ (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a ^ (__v8si)__b);
}

/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
}

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d __a, __m128i __c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d __a, __m256i __c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 __a, __m128i __c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 __a, __m256i __c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a,
                                                (__v8si)__c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
    __m128d __A = (A); \
    (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                     (C) & 0x1, ((C) & 0x2) >> 1); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
    __m256d __A = (A); \
    (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                     (C) & 0x1, ((C) & 0x2) >> 1, \
                                     2 + (((C) & 0x4) >> 2), \
                                     2 + (((C) & 0x8) >> 3)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
    __m128 __A = (A); \
    (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                    (C) & 0x3, ((C) & 0xc) >> 2, \
                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
    __m256 __A = (A); \
    (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                    (C) & 0x3, ((C) & 0xc) >> 2, \
                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                    4 + (((C) & 0x03) >> 0), \
                                    4 + (((C) & 0x0c) >> 2), \
                                    4 + (((C) & 0x30) >> 4), \
                                    4 + (((C) & 0xc0) >> 6)); })
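/* Usage sketch (illustrative, not part of the original header): the
 * horizontal ops add adjacent pairs within each 128-bit half, so a full
 * reduction also needs a cross-half step. The helper name hsum_ps is
 * hypothetical; _mm_add_ps, _mm_hadd_ps and _mm_cvtss_f32 come from the
 * SSE headers that <immintrin.h> already includes.
 *
 *   static inline float hsum_ps(__m256 __v)
 *   {
 *     __m128 __lo = _mm256_castps256_ps128(__v);      // lanes 0..3
 *     __m128 __hi = _mm256_extractf128_ps(__v, 1);    // lanes 4..7
 *     __m128 __s  = _mm_add_ps(__lo, __hi);           // 4 partial sums
 *     __s = _mm_hadd_ps(__s, __s);                    // 2 partial sums
 *     __s = _mm_hadd_ps(__s, __s);                    // total in lane 0
 *     return _mm_cvtss_f32(__s);
 *   }
 */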
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m256i __V2 = (V2); \
    (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
                                     (((M) & 0x01) ? 4 : 0), \
                                     (((M) & 0x02) ? 5 : 1), \
                                     (((M) & 0x04) ? 6 : 2), \
                                     (((M) & 0x08) ? 7 : 3)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
                                    (((M) & 0x01) ?  8 : 0), \
                                    (((M) & 0x02) ?  9 : 1), \
                                    (((M) & 0x04) ? 10 : 2), \
                                    (((M) & 0x08) ? 11 : 3), \
                                    (((M) & 0x10) ? 12 : 4), \
                                    (((M) & 0x20) ? 13 : 5), \
                                    (((M) & 0x40) ? 14 : 6), \
                                    (((M) & 0x80) ? 15 : 7)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
{
  return (__m256d)__builtin_ia32_blendvpd256(
    (__v4df)__a, (__v4df)__b, (__v4df)__c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
{
  return (__m256)__builtin_ia32_blendvps256(
    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}

/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
                                     (mask) & 0x1, \
                                     (((mask) & 0x2) >> 1) + 4, \
                                     (((mask) & 0x4) >> 2) + 2, \
                                     (((mask) & 0x8) >> 3) + 6); })
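/* Usage sketch (illustrative, not part of the original header): the blend
 * immediate selects per lane, bit i choosing lane i of the second operand.
 * Assumes <immintrin.h> and AVX.
 *
 *   __m256 __a = _mm256_set1_ps(1.0f);
 *   __m256 __b = _mm256_set1_ps(2.0f);
 *   __m256 __r = _mm256_blend_ps(__a, __b, 0xf0);  // lanes 0-3 from a, 4-7 from b
 */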
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
    __m256d __A = (A); \
    (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
    __m256 __A = (A); \
    (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
    __m256i __A = (A); \
    (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
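/* Usage sketch (illustrative, not part of the original header): a compare
 * yields an all-ones/all-zeros mask per lane, and _mm256_movemask_ps
 * (defined further below) packs the lane sign bits into an int. Assumes
 * <immintrin.h> and AVX.
 *
 *   __m256 __x = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m256 __m = _mm256_cmp_ps(__x, _mm256_set1_ps(3.5f), _CMP_LT_OQ);
 *   int __bits = _mm256_movemask_ps(__m);   // 0x0f: lanes 0-3 compare true
 */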
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i __a, int const __imm)
{
  __v8si __b = (__v8si)__a;
  return __b[__imm & 7];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i __a, int const __imm)
{
  __v16hi __b = (__v16hi)__a;
  return __b[__imm & 15];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i __a, int const __imm)
{
  __v32qi __b = (__v32qi)__a;
  return __b[__imm & 31];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i __a, const int __imm)
{
  __v4di __b = (__v4di)__a;
  return __b[__imm & 3];
}
#endif

/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m128d __V2 = (V2); \
    (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m128 __V2 = (V2); \
    (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m128i __V2 = (V2); \
    (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
  __v8si __c = (__v8si)__a;
  __c[__imm & 7] = __b;
  return (__m256i)__c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
{
  __v16hi __c = (__v16hi)__a;
  __c[__imm & 15] = __b;
  return (__m256i)__c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
{
  __v32qi __c = (__v32qi)__a;
  __c[__imm & 31] = __b;
  return (__m256i)__c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
  __v4di __c = (__v4di)__a;
  __c[__imm & 3] = __b;
  return (__m256i)__c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i __a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i __a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d __a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 __a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 __a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
}
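/* Usage sketch (illustrative, not part of the original header): composing a
 * 256-bit vector from two 128-bit halves, the same pattern the unaligned
 * loadu2 helpers at the end of this file use. Assumes <immintrin.h>.
 *
 *   __m128 __lo = _mm_set1_ps(1.0f);
 *   __m128 __hi = _mm_set1_ps(2.0f);
 *   __m256 __v  = _mm256_insertf128_ps(_mm256_castps128_ps256(__lo), __hi, 1);
 */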
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 __a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
}

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
}
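/* Usage sketch (illustrative, not part of the original header): the "tt"
 * conversions truncate toward zero, while the plain ones use the current
 * rounding mode (round-to-nearest-even unless MXCSR says otherwise).
 * Assumes <immintrin.h>.
 *
 *   __m256d __d = _mm256_set1_pd(1.5);
 *   __m128i __t = _mm256_cvttpd_epi32(__d);  // {1, 1, 1, 1}, truncated
 *   __m128i __r = _mm256_cvtpd_epi32(__d);   // {2, 2, 2, 2}, rounded to even
 */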
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d __a)
{
  return __builtin_ia32_movmskpd256((__v4df)__a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 __a)
{
  return __builtin_ia32_movmskps256((__v8sf)__a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m128)(__v4sf){ __f, __f, __f, __f };
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *__a)
{
  double __d = *__a;
  return (__m256d)(__v4df){ __d, __d, __d, __d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *__a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
}
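/* Usage sketch (illustrative, not part of the original header):
 * _mm256_testz_si256(a, b) returns 1 exactly when (a & b) is all zeros, so
 * testing a vector against itself asks "is every bit zero?". The helper
 * name is_all_zero is hypothetical; assumes <immintrin.h>.
 *
 *   static inline int is_all_zero(__m256i __v)
 *   {
 *     return _mm256_testz_si256(__v, __v);
 *   }
 */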
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *__a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *__p)
{
  return *(__m256d *)__p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *__p)
{
  return *(__m256 *)__p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *__p)
{
  struct __loadu_pd {
    __m256d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__p)->__v;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *__p)
{
  struct __loadu_ps {
    __m256 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *__p)
{
  return *__p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *__p)
{
  struct __loadu_si256 {
    __m256i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si256*)__p)->__v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *__p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *__p, __m256d __a)
{
  *(__m256d *)__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *__p, __m256 __a)
{
  *(__m256 *)__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *__p, __m256d __a)
{
  __builtin_ia32_storeupd256(__p, (__v4df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *__p, __m256 __a)
{
  __builtin_ia32_storeups256(__p, (__v8sf)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *__p, __m256i __a)
{
  *__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *__p, __m256i __a)
{
  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
}
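/* Usage sketch (illustrative, not part of the original header): the plain
 * load/store forms require 32-byte alignment; the loadu/storeu forms accept
 * any address. Assumes <immintrin.h>.
 *
 *   float __buf[8];                                 // arbitrary alignment
 *   __m256 __v = _mm256_loadu_ps(__buf);            // unaligned load
 *
 *   float __abuf[8] __attribute__((aligned(32)));
 *   _mm256_store_ps(__abuf, __v);                   // aligned store
 */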
/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *__p, __m128d __m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *__p, __m256d __m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
                                               (__v4df)__m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *__p, __m128 __m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *__p, __m256 __m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
{
  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
{
  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
{
  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *__a, __m256d __b)
{
  __builtin_ia32_movntpd256(__a, (__v4df)__b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *__p, __m256 __a)
{
  __builtin_ia32_movntps256(__p, (__v8sf)__a);
}
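/* Usage sketch (illustrative, not part of the original header): in this
 * header the maskstore mask is a float/double vector whose lane sign bits
 * (the element MSBs) select which lanes are written. Assumes <immintrin.h>,
 * with __p and __v standing in for a caller's pointer and data.
 *
 *   __m256d __m = _mm256_castsi256_pd(_mm256_set_epi64x(-1LL, 0, -1LL, 0));
 *   _mm256_maskstore_pd(__p, __m, __v);    // writes only lanes 1 and 3
 */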
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __d, __c, __b, __a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                 int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                 short __w11, short __w10, short __w09, short __w08,
                 short __w07, short __w06, short __w05, short __w04,
                 short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                char __b27, char __b26, char __b25, char __b24,
                char __b23, char __b22, char __b21, char __b20,
                char __b19, char __b18, char __b17, char __b16,
                char __b15, char __b14, char __b13, char __b12,
                char __b11, char __b10, char __b09, char __b08,
                char __b07, char __b06, char __b05, char __b04,
                char __b03, char __b02, char __b01, char __b00)
{
  return (__m256i)(__v32qi){
    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
{
  return (__m256i)(__v4di){ __d, __c, __b, __a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __a, __b, __c, __d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
{
  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
                  short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
                 char __b23, char __b22, char __b21, char __b20,
                 char __b19, char __b18, char __b17, char __b16,
                 char __b15, char __b14, char __b13, char __b12,
                 char __b11, char __b10, char __b09, char __b08,
                 char __b07, char __b06, char __b05, char __b04,
                 char __b03, char __b02, char __b01, char __b00)
{
  return (__m256i)(__v32qi){
    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
{
  return (__m256i)(__v4di){ __a, __b, __c, __d };
}
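/* Usage sketch (illustrative, not part of the original header): the set
 * forms take arguments from the highest element down, the setr forms in
 * memory order; the two vectors below are identical. Assumes <immintrin.h>.
 *
 *   __m256i __a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);   // element 0 is 0
 *   __m256i __b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);  // same vector
 */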
/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double __w)
{
  return (__m256d){ __w, __w, __w, __w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float __w)
{
  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int __i)
{
  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short __w)
{
  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char __b)
{
  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
    __b, __b, __b, __b, __b, __b, __b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long __q)
{
  return (__m256i)(__v4di){ __q, __q, __q, __q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d __a)
{
  return (__m256)__a;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d __a)
{
  return (__m256i)__a;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 __a)
{
  return (__m256d)__a;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 __a)
{
  return (__m256i)__a;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i __a)
{
  return (__m256)__a;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i __a)
{
  return (__m256d)__a;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}
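/* Usage sketch (illustrative, not part of the original header): the
 * 256-to-256 casts only reinterpret bits and emit no instructions, and the
 * 128-to-256 casts leave the upper 128 bits undefined. Assumes
 * <immintrin.h>.
 *
 *   __m256  __f = _mm256_set1_ps(1.0f);
 *   __m256i __i = _mm256_castps_si256(__f);   // same bits, integer type
 *   __m256  __g = _mm256_castsi256_ps(__i);   // round-trips back to __f
 */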
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}

/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
  struct __loadu_ps {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));

  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));

  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  __m256i __v256 = _mm256_castsi128_si256(
    ((struct __loadu_si128*)__addr_lo)->__v);
  return _mm256_insertf128_si256(__v256,
                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
}

/* SIMD store ops (unaligned) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
  __m128 __v128;

  __v128 = _mm256_castps256_ps128(__a);
  __builtin_ia32_storeups(__addr_lo, __v128);
  __v128 = _mm256_extractf128_ps(__a, 1);
  __builtin_ia32_storeups(__addr_hi, __v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
  __m128d __v128;

  __v128 = _mm256_castpd256_pd128(__a);
  __builtin_ia32_storeupd(__addr_lo, __v128);
  __v128 = _mm256_extractf128_pd(__a, 1);
  __builtin_ia32_storeupd(__addr_hi, __v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
{
  __m128i __v128;

  __v128 = _mm256_castsi256_si128(__a);
  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
  __v128 = _mm256_extractf128_si256(__a, 1);
  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
}

#endif /* __AVXINTRIN_H */
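/* Usage sketch (illustrative, not part of the original header): the
 * loadu2/storeu2 helpers move two unaligned 128-bit halves as one 256-bit
 * value; note that the high-half pointer comes first. Assumes <immintrin.h>
 * and that the input arrays have been filled by the caller.
 *
 *   float __lo[4], __hi[4], __out_lo[4], __out_hi[4];
 *   __m256 __v = _mm256_loadu2_m128(__hi, __lo);   // lanes 0-3 from lo, 4-7 from hi
 *   _mm256_storeu2_m128(__out_hi, __out_lo, __v);
 */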