clang API Documentation
/*===---- avx512fintrin.h - AVX512F intrinsics ------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H

typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64)));
typedef double __m512d __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64)));

typedef unsigned char __mmask8;
typedef unsigned short __mmask16;

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF     0x01
#define _MM_FROUND_TO_POS_INF     0x02
#define _MM_FROUND_TO_ZERO        0x03
#define _MM_FROUND_CUR_DIRECTION  0x04

/* Create vectors with repeated elements */

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_setzero_si512(void)
{
  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
              (__v16si) _mm512_setzero_si512 (),
              __M);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
#ifdef __x86_64__
  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
              (__v8di) _mm512_setzero_si512 (),
              __M);
#else
  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
              (__v8di) _mm512_setzero_si512 (),
              __M);
#endif
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_setzero_ps(void)
{
  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_setzero_pd(void)
{
  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_set1_ps(float __w)
{
  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                   __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_set1_pd(double __w)
{
  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi32(int __s)
{
  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
                             __s, __s, __s, __s, __s, __s, __s, __s };
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi64(long long __d)
{
  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastss_ps(__m128 __X)
{
  float __f = __X[0];
  return (__v16sf){ __f, __f, __f, __f,
                    __f, __f, __f, __f,
                    __f, __f, __f, __f,
                    __f, __f, __f, __f };
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastsd_pd(__m128d __X)
{
  double __d = __X[0];
  return (__v8df){ __d, __d, __d, __d,
                   __d, __d, __d, __d };
}

/* Cast between vector types */

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

/* Arithmetic */

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_add_pd(__m512d __a, __m512d __b)
{
  return __a + __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_add_ps(__m512 __a, __m512 __b)
{
  return __a + __b;
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_mul_pd(__m512d __a, __m512d __b)
{
  return __a * __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_mul_ps(__m512 __a, __m512 __b)
{
  return __a * __b;
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sub_pd(__m512d __a, __m512d __b)
{
  return __a - __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sub_ps(__m512 __a, __m512 __b)
{
  return __a - __b;
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_max_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
                                                 (__v8df) __B,
                                                 (__v8df) _mm512_setzero_pd (),
                                                 (__mmask8) -1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_max_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) _mm512_setzero_ps (),
                                                (__mmask16) -1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_max_epi32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
                                                  (__v16si) __B,
                                                  (__v16si) _mm512_setzero_si512 (),
                                                  (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_max_epu32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
                                                  (__v16si) __B,
                                                  (__v16si) _mm512_setzero_si512 (),
                                                  (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_max_epi64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
                                                  (__v8di) __B,
                                                  (__v8di) _mm512_setzero_si512 (),
                                                  (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_max_epu64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
                                                  (__v8di) __B,
                                                  (__v8di) _mm512_setzero_si512 (),
                                                  (__mmask8) -1);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_min_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
                                                 (__v8df) __B,
                                                 (__v8df) _mm512_setzero_pd (),
                                                 (__mmask8) -1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_min_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) _mm512_setzero_ps (),
                                                (__mmask16) -1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_min_epi32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
                                                  (__v16si) __B,
                                                  (__v16si) _mm512_setzero_si512 (),
                                                  (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_min_epu32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
                                                  (__v16si) __B,
                                                  (__v16si) _mm512_setzero_si512 (),
                                                  (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_min_epi64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
                                                  (__v8di) __B,
                                                  (__v8di) _mm512_setzero_si512 (),
                                                  (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_min_epu64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
                                                  (__v8di) __B,
                                                  (__v8di) _mm512_setzero_si512 (),
                                                  (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
                                                  (__v16si) __Y,
                                                  (__v8di) _mm512_setzero_si512 (),
                                                  (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
                                                   (__v16si) __Y,
                                                   (__v8di) _mm512_setzero_si512 (),
                                                   (__mmask8) -1);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_pd(__m512d a)
{
  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)a,
                                                (__v8df) _mm512_setzero_pd (),
                                                (__mmask8) -1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_ps(__m512 a)
{
  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)a,
                                               (__v16sf) _mm512_setzero_ps (),
                                               (__mmask16) -1,
                                               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) -1);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) -1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) _mm_setzero_ps (),
                                                 (__mmask8) -1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
                                                  (__v2df) __B,
                                                  (__v2df) _mm_setzero_pd (),
                                                  (__mmask8) -1);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                                                   (__v8df) _mm512_setzero_pd (),
                                                   (__mmask8) -1);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                                                  (__v16sf) _mm512_setzero_ps (),
                                                  (__mmask16) -1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                                               (__v4sf) __B,
                                               (__v4sf) _mm_setzero_ps (),
                                               (__mmask8) -1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
                                                (__v2df) __B,
                                                (__v2df) _mm_setzero_pd (),
                                                (__mmask8) -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_floor_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_FLOOR,
                                                  (__v16sf) __A, -1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_floor_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_FLOOR,
                                                   (__v8df) __A, -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_ceil_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_CEIL,
                                                  (__v16sf) __A, -1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_ceil_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_CEIL,
                                                   (__v8df) __A, -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_abs_epi64(__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
                                                 (__v8di) _mm512_setzero_si512 (),
                                                 (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_abs_epi32(__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
                                                 (__v16si) _mm512_setzero_si512 (),
                                                 (__mmask16) -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_roundscale_ps(__m512 __A, const int __imm)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
                                                  (__v16sf) __A, -1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_roundscale_pd(__m512d __A, const int __imm)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
                                                   (__v8df) __A, -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfmaddpd512_mask(__A, __B, __C,
                                    (__mmask8) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfmsubpd512_mask(__A, __B, __C,
                                    (__mmask8) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfnmaddpd512_mask(__A, __B, __C,
                                     (__mmask8) -1,
                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfmaddps512_mask(__A, __B, __C,
                                    (__mmask16) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfmsubps512_mask(__A, __B, __C,
                                    (__mmask16) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfnmaddps512_mask(__A, __B, __C,
                                     (__mmask16) -1,
                                     _MM_FROUND_CUR_DIRECTION);
}

/* Vector permutations */

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I /* idx */,
                                                       (__v16si) __A,
                                                       (__v16si) __B,
                                                       (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I /* idx */,
                                                       (__v8di) __A,
                                                       (__v8di) __B,
                                                       (__mmask8) -1);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I /* idx */,
                                                        (__v8df) __A,
                                                        (__v8df) __B,
                                                        (__mmask8) -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I /* idx */,
                                                       (__v16sf) __A,
                                                       (__v16sf) __B,
                                                       (__mmask16) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_valign_epi64(__m512i __A, __m512i __B, const int __I)
{
  return (__m512i) __builtin_ia32_alignq512_mask((__v8di)__A,
                                                 (__v8di)__B,
                                                 __I,
                                                 (__v8di)_mm512_setzero_si512(),
                                                 (__mmask8) -1);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_valign_epi32(__m512i __A, __m512i __B, const int __I)
{
  return (__m512i)__builtin_ia32_alignd512_mask((__v16si)__A,
                                                (__v16si)__B,
                                                __I,
                                                (__v16si)_mm512_setzero_si512(),
                                                (__mmask16) -1);
}

/* Vector Blend */

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
{
  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
                                                     (__v8df) __W,
                                                     (__mmask8) __U);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
{
  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
                                                    (__v16sf) __W,
                                                    (__mmask16) __U);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
                                                    (__v8di) __W,
                                                    (__mmask8) __U);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
                                                    (__v16si) __W,
                                                    (__mmask16) __U);
}

/* Compare */

static __inline __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_cmp_ps_mask(__m512 a, __m512 b, const int p)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) a,
                                                   (__v16sf) b, p,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
                                                  (__v8df) __Y, __P,
                                                  (__mmask8) -1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Conversion */

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttps_epu32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                                                      (__v16si) _mm512_setzero_si512 (),
                                                      (__mmask16) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundepi32_ps(__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
                                                   (__v16sf) _mm512_setzero_ps (),
                                                   (__mmask16) -1,
                                                   __R);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundepu32_ps(__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) -1,
                                                    __R);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_cvtepi32_pd(__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
                                                    (__v8df) _mm512_setzero_pd (),
                                                    (__mmask8) -1);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_cvtepu32_pd(__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) -1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_ps(__m512d __A, const int __R)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                                                   (__v8sf) _mm256_setzero_ps (),
                                                   (__mmask8) -1,
                                                   __R);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvtps_ph(__m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
                                                     __I,
                                                     (__v16hi) _mm256_setzero_si256 (),
                                                     -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_cvtph_ps(__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttps_epi32(__m512 a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttpd_epi32(__m512d a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) a,
                                                   (__v8si)_mm256_setzero_si256(),
                                                   (__mmask8) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvtt_roundpd_epi32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                                                     (__v8si) _mm256_setzero_si256 (),
                                                     (__mmask8) -1,
                                                     __R);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvtt_roundps_epi32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                                                     (__v16si) _mm512_setzero_si512 (),
                                                     (__mmask16) -1,
                                                     __R);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epi32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                                    (__v16si) _mm512_setzero_si512 (),
                                                    (__mmask16) -1,
                                                    __R);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epi32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                    (__v8si) _mm256_setzero_si256 (),
                                                    (__mmask8) -1,
                                                    __R);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epu32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                                     (__v16si) _mm512_setzero_si512 (),
                                                     (__mmask16) -1,
                                                     __R);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epu32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                     (__v8si) _mm256_setzero_si256 (),
                                                     (__mmask8) -1,
                                                     __R);
}

/* Bit Test */

static __inline __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_test_epi32_mask(__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
                                                (__v16si) __B,
                                                (__mmask16) -1);
}

static __inline __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_test_epi64_mask(__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
                                               (__v8di) __B,
                                               (__mmask8) -1);
}

/* SIMD load ops */

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
                                                     (__v16si) _mm512_setzero_si512 (),
                                                     (__mmask16) __U);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
                                                     (__v8di) _mm512_setzero_si512 (),
                                                     (__mmask8) __U);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
                                                  (__v16sf) _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
                                                   (__v8df) _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_pd(double const *__p)
{
  struct __loadu_pd {
    __m512d __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)__p)->__v;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_ps(float const *__p)
{
  struct __loadu_ps {
    __m512 __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps*)__p)->__v;
}

/* SIMD store ops */

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
                                     (__mmask8) __U);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
                                     (__mmask16) __U);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_storeu_pd(void *__P, __m512d __A)
{
  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_storeu_ps(void *__P, __m512 __A)
{
  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512*)__P = __A;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d*)__P = __A;
}

/* Mask ops */

static __inline __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_knot(__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}

/* Integer compare */

static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                   (__mmask16)-1);
}

static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                   __u);
}

static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                  __u);
}

static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                  (__mmask8)-1);
}

#endif // __AVX512FINTRIN_H
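
Usage sketch (not part of the header): the short program below is illustrative only; the file name, constants, and build flag are assumptions rather than anything stated on this page. It reaches the intrinsics above through <immintrin.h>, as the #error guard requires, and needs an AVX-512F capable compiler and CPU (for example, clang -mavx512f example.c).

/* example.c: minimal, hypothetical use of a few AVX-512F intrinsics. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  /* Broadcast scalars into all 16 single-precision lanes and add them. */
  __m512 a   = _mm512_set1_ps(1.5f);
  __m512 b   = _mm512_set1_ps(2.5f);
  __m512 sum = _mm512_add_ps(a, b);

  /* Unaligned store of the full 64-byte vector. */
  float out[16];
  _mm512_storeu_ps(out, sum);

  /* Masked store: only lanes whose bit is set in the __mmask16 are
   * written, so out2[8..15] keep their initial value of -1.0f. */
  float out2[16];
  for (int i = 0; i < 16; ++i)
    out2[i] = -1.0f;
  _mm512_mask_storeu_ps(out2, (__mmask16)0x00FF, sum);

  /* Lane-wise integer compare producing a 16-bit predicate mask. */
  __mmask16 eq = _mm512_cmpeq_epi32_mask(_mm512_set1_epi32(7),
                                         _mm512_set1_epi32(7));

  printf("sum[0]=%.1f out2[0]=%.1f out2[15]=%.1f eq=0x%04x\n",
         out[0], out2[0], out2[15], (unsigned)eq);
  return 0;
}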