clang API Documentation

avxintrin.h
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d __a, __m256d __b)
{
  return __a + __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 __a, __m256 __b)
{
  return __a + __b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d __a, __m256d __b)
{
  return __a - __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 __a, __m256 __b)
{
  return __a - __b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d __a, __m256d __b)
{
  return __a / __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 __a, __m256 __b)
{
  return __a / __b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d __a, __m256d __b)
{
  return __a * __b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 __a, __m256 __b)
{
  return __a * __b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d __a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
  __m256d __V = (V); \
  (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

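/*
 * Editor's example (illustrative, not part of the original header): the
 * arithmetic intrinsics above each map onto a single 256-bit AVX
 * instruction. A minimal sketch, assuming a caller that includes
 * <immintrin.h> and compiles with -mavx; the helper name axpy4 is
 * hypothetical:
 *
 *   void axpy4(double *y, const double *x, double a, int n)
 *   {
 *     __m256d va = _mm256_set1_pd(a);
 *     for (int i = 0; i + 4 <= n; i += 4) {
 *       __m256d vx = _mm256_loadu_pd(x + i);
 *       __m256d vy = _mm256_loadu_pd(y + i);
 *       vy = _mm256_add_pd(vy, _mm256_mul_pd(va, vx));  // y += a * x
 *       _mm256_storeu_pd(y + i, vy);
 *     }
 *   }
 */
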
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a & (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a & (__v8si)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d __a, __m256d __b)
{
  return (__m256d)(~(__v4di)__a & (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 __a, __m256 __b)
{
  return (__m256)(~(__v8si)__a & (__v8si)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a | (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a | (__v8si)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4di)__a ^ (__v4di)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8si)__a ^ (__v8si)__b);
}

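/*
 * Editor's note (not part of the original header): as the bodies above
 * show, _mm256_andnot_pd/_ps compute ~a & b, i.e. the FIRST operand is
 * complemented. A common use is clearing sign bits to take an absolute
 * value; a sketch, assuming <immintrin.h> and -mavx:
 *
 *   __m256d abs_pd(__m256d x)
 *   {
 *     return _mm256_andnot_pd(_mm256_set1_pd(-0.0), x);  // clear sign bits
 *   }
 */
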
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
}

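/*
 * Editor's note (not part of the original header): the horizontal ops work
 * within each 128-bit lane and interleave the two sources. For
 * r = _mm256_hadd_pd(a, b):
 *   r[0] = a[0]+a[1], r[1] = b[0]+b[1], r[2] = a[2]+a[3], r[3] = b[2]+b[3]
 * so a full cross-lane reduction still needs _mm256_extractf128_pd (below).
 */
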
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d __a, __m128i __c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d __a, __m256i __c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 __a, __m128i __c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 __a, __m256i __c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
                                   2 + (((C) & 0x4) >> 2), \
                                   2 + (((C) & 0x8) >> 3)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                  4 + (((C) & 0x03) >> 0), \
                                  4 + (((C) & 0x0c) >> 2), \
                                  4 + (((C) & 0x30) >> 4), \
                                  4 + (((C) & 0xc0) >> 6)); })

#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

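/*
 * Editor's note (not part of the original header): for
 * _mm256_permute2f128_*(V1, V2, M), bits 1:0 of M pick the source of the
 * low 128-bit half (0 = low half of V1, 1 = high half of V1, 2 = low half
 * of V2, 3 = high half of V2), bits 5:4 pick the source of the high half,
 * and setting bit 3 or bit 7 zeroes the corresponding half instead. For
 * example, _mm256_permute2f128_pd(v, v, 0x01) swaps the two halves of v.
 */
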
/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
{
  return (__m256d)__builtin_ia32_blendvpd256(
    (__v4df)__a, (__v4df)__b, (__v4df)__c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
{
  return (__m256)__builtin_ia32_blendvps256(
    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}

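/*
 * Editor's example (illustrative, not part of the original header):
 * _mm256_blend_* select with a compile-time mask, while _mm256_blendv_*
 * select per element from the sign bit of a runtime mask, e.g. the result
 * of a comparison. A sketch, assuming <immintrin.h> and -mavx; clamp_lo is
 * a hypothetical helper:
 *
 *   __m256 clamp_lo(__m256 x, __m256 lo)
 *   {
 *     __m256 m = _mm256_cmp_ps(x, lo, _CMP_LT_OS);  // lanes where x < lo
 *     return _mm256_blendv_ps(x, lo, m);            // replace them with lo
 *   }
 */
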
/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

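/*
 * Editor's note (not part of the original header): _mm256_dp_ps performs
 * two independent 4-element dot products, one per 128-bit lane. The high
 * nibble of M selects which element products enter the sum; the low nibble
 * selects which result elements of each lane receive the sum (the rest are
 * zeroed). E.g. M = 0xff broadcasts the full per-lane dot product.
 */
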
/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

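/*
 * Editor's example (illustrative, not part of the original header): the
 * cmp intrinsics return a per-element mask of all-ones (true) or all-zeros
 * (false), which pairs naturally with movemask (defined below). A sketch,
 * assuming <immintrin.h> and -mavx; count_lt is a hypothetical helper:
 *
 *   int count_lt(__m256 v, float t)
 *   {
 *     __m256 m = _mm256_cmp_ps(v, _mm256_set1_ps(t), _CMP_LT_OS);
 *     return __builtin_popcount(_mm256_movemask_ps(m));  // set bits = hits
 *   }
 */
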
/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i __a, int const __imm)
{
  __v8si __b = (__v8si)__a;
  return __b[__imm & 7];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i __a, int const __imm)
{
  __v16hi __b = (__v16hi)__a;
  return __b[__imm & 15];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i __a, int const __imm)
{
  __v32qi __b = (__v32qi)__a;
  return __b[__imm & 31];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i __a, const int __imm)
{
  __v4di __b = (__v4di)__a;
  return __b[__imm & 3];
}
#endif

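/*
 * Editor's example (illustrative, not part of the original header):
 * extractf128 is the usual way to finish a cross-lane reduction. A sketch,
 * assuming <immintrin.h> and -mavx (the 128-bit intrinsics come from the
 * SSE headers that <immintrin.h> also provides); hsum_pd is hypothetical:
 *
 *   double hsum_pd(__m256d v)
 *   {
 *     __m128d lo = _mm256_castpd256_pd128(v);    // low 128 bits, no-op
 *     __m128d hi = _mm256_extractf128_pd(v, 1);  // high 128 bits
 *     __m128d s  = _mm_add_pd(lo, hi);           // pairwise sums
 *     return ((double *)&s)[0] + ((double *)&s)[1];
 *   }
 */
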
/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
  __v8si __c = (__v8si)__a;
  __c[__imm & 7] = __b;
  return (__m256i)__c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
{
  __v16hi __c = (__v16hi)__a;
  __c[__imm & 15] = __b;
  return (__m256i)__c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
{
  __v32qi __c = (__v32qi)__a;
  __c[__imm & 31] = __b;
  return (__m256i)__c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
  __v4di __c = (__v4di)__a;
  __c[__imm & 3] = __b;
  return (__m256i)__c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i __a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i __a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d __a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 __a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 __a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 __a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
}

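/*
 * Editor's note (not part of the original header): the cvt conversions to
 * integer round according to the current rounding mode (round-to-nearest-
 * even by default), while the cvtt ("truncate") variants always round
 * toward zero, matching C's float-to-int cast semantics.
 */
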
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

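/*
 * Editor's note (not part of the original header): unlike their 128-bit
 * SSE counterparts, the 256-bit unpack ops interleave within each 128-bit
 * lane separately, as the shuffle indices above show. For
 * r = _mm256_unpacklo_pd(a, b): r = { a[0], b[0], a[2], b[2] }.
 */
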
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
}

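/*
 * Editor's note (not part of the original header): these map onto the
 * VTESTPD/VTESTPS/VPTEST instructions. testz returns 1 when a AND b is all
 * zero, testc returns 1 when (NOT a) AND b is all zero, and testnzc
 * returns 1 when neither holds; the pd/ps forms look only at the sign
 * bits, so they combine naturally with comparison masks. For example,
 * _mm256_testc_ps(m, _mm256_castsi256_ps(_mm256_set1_epi32(-1))) is 1 only
 * if every lane of the mask m is true.
 */
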
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d __a)
{
  return __builtin_ia32_movmskpd256((__v4df)__a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 __a)
{
  return __builtin_ia32_movmskps256((__v8sf)__a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

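/*
 * Editor's note (not part of the original header): _mm256_zeroupper clears
 * the upper 128 bits of every YMM register. Executing it before running
 * legacy (non-VEX) SSE code avoids the AVX-to-SSE transition penalty;
 * compilers targeting AVX generally insert it automatically around calls
 * into code that may use legacy SSE.
 */
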
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m128)(__v4sf){ __f, __f, __f, __f };
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *__a)
{
  double __d = *__a;
  return (__m256d)(__v4df){ __d, __d, __d, __d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *__a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *__a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
}

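/*
 * Editor's example (illustrative, not part of the original header): the
 * broadcast loads replicate a scalar or 128-bit value from memory across a
 * whole vector. A sketch, assuming <immintrin.h> and -mavx; scale is a
 * hypothetical helper:
 *
 *   void scale(float *x, const float *s, int n)   // x[i] *= *s
 *   {
 *     __m256 vs = _mm256_broadcast_ss(s);
 *     for (int i = 0; i + 8 <= n; i += 8)
 *       _mm256_storeu_ps(x + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), vs));
 *   }
 */
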
/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *__p)
{
  return *(__m256d *)__p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *__p)
{
  return *(__m256 *)__p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *__p)
{
  struct __loadu_pd {
    __m256d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__p)->__v;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *__p)
{
  struct __loadu_ps {
    __m256 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *__p)
{
  return *__p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *__p)
{
  struct __loadu_si256 {
    __m256i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si256*)__p)->__v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *__p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
}

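/*
 * Editor's note (not part of the original header): _mm256_load_* require
 * 32-byte-aligned pointers (misaligned addresses fault), while the loadu
 * variants accept any address; the packed, may_alias struct wrapper makes
 * the compiler emit an unaligned vector load without violating strict
 * aliasing. _mm256_lddqu_si256 behaves like _mm256_loadu_si256 and may
 * perform better on data that crosses cache-line boundaries.
 */
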
/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *__p, __m256d __a)
{
  *(__m256d *)__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *__p, __m256 __a)
{
  *(__m256 *)__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *__p, __m256d __a)
{
  __builtin_ia32_storeupd256(__p, (__v4df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *__p, __m256 __a)
{
  __builtin_ia32_storeups256(__p, (__v8sf)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *__p, __m256i __a)
{
  *__p = __a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *__p, __m256i __a)
{
  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *__p, __m128d __m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *__p, __m256d __m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
                                               (__v4df)__m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *__p, __m128 __m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *__p, __m256 __m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
}

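/*
 * Editor's example (illustrative, not part of the original header):
 * maskload reads only the elements whose mask lane has its sign bit set
 * and zeroes the rest, which makes it handy for loop tails. A sketch,
 * assuming <immintrin.h> and -mavx; load_tail is a hypothetical helper:
 *
 *   __m256d load_tail(const double *p, int n)   // n in [0,4)
 *   {
 *     __m256i m = _mm256_setr_epi64x(n > 0 ? -1 : 0, n > 1 ? -1 : 0,
 *                                    n > 2 ? -1 : 0, 0);
 *     return _mm256_maskload_pd(p, _mm256_castsi256_pd(m));
 *   }
 */
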
/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
{
  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
{
  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
{
  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *__a, __m256d __b)
{
  __builtin_ia32_movntpd256(__a, (__v4df)__b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *__p, __m256 __a)
{
  __builtin_ia32_movntps256(__p, (__v8sf)__a);
}

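/*
 * Editor's note (not part of the original header): the stream stores are
 * non-temporal: they write around the cache and require 32-byte-aligned
 * destinations. Because they are weakly ordered, code that publishes the
 * data to another thread should fence afterwards (e.g. with _mm_sfence
 * from the SSE headers).
 */
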
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __d, __c, __b, __a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                 int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                 short __w11, short __w10, short __w09, short __w08,
                 short __w07, short __w06, short __w05, short __w04,
                 short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                char __b27, char __b26, char __b25, char __b24,
                char __b23, char __b22, char __b21, char __b20,
                char __b19, char __b18, char __b17, char __b16,
                char __b15, char __b14, char __b13, char __b12,
                char __b11, char __b10, char __b09, char __b08,
                char __b07, char __b06, char __b05, char __b04,
                char __b03, char __b02, char __b01, char __b00)
{
  return (__m256i)(__v32qi){
    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
{
  return (__m256i)(__v4di){ __d, __c, __b, __a };
}

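/*
 * Editor's note (not part of the original header): the set functions take
 * their arguments from the highest element down to the lowest, while the
 * setr ("reverse") functions below take them in memory order. So
 * _mm256_set_pd(3, 2, 1, 0) and _mm256_setr_pd(0, 1, 2, 3) build the same
 * vector, with element 0 equal to 0.
 */
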
/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __a, __b, __c, __d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
{
  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
                  short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
                 char __b23, char __b22, char __b21, char __b20,
                 char __b19, char __b18, char __b17, char __b16,
                 char __b15, char __b14, char __b13, char __b12,
                 char __b11, char __b10, char __b09, char __b08,
                 char __b07, char __b06, char __b05, char __b04,
                 char __b03, char __b02, char __b01, char __b00)
{
  return (__m256i)(__v32qi){
    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
{
  return (__m256i)(__v4di){ __a, __b, __c, __d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double __w)
{
  return (__m256d){ __w, __w, __w, __w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float __w)
{
  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int __i)
{
  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short __w)
{
  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char __b)
{
  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
    __b, __b, __b, __b, __b, __b, __b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long __q)
{
  return (__m256i)(__v4di){ __q, __q, __q, __q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d __a)
{
  return (__m256)__a;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d __a)
{
  return (__m256i)__a;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 __a)
{
  return (__m256d)__a;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 __a)
{
  return (__m256i)__a;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i __a)
{
  return (__m256)__a;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i __a)
{
  return (__m256d)__a;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}

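/*
 * Editor's note (not part of the original header): the same-width casts
 * are pure reinterpretations and generate no instructions. The 256-to-128
 * casts return the low half, and the 128-to-256 casts leave the upper half
 * undefined (the -1 shuffle indices above), so initialize it explicitly,
 * e.g. with _mm256_insertf128_ps, before relying on it.
 */
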
/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
  struct __loadu_ps {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));

  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));

  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));

  __m256i __v256 = _mm256_castsi128_si256(
    ((struct __loadu_si128*)__addr_lo)->__v);
  return _mm256_insertf128_si256(__v256,
                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
}

/* SIMD store ops (unaligned) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
  __m128 __v128;

  __v128 = _mm256_castps256_ps128(__a);
  __builtin_ia32_storeups(__addr_lo, __v128);
  __v128 = _mm256_extractf128_ps(__a, 1);
  __builtin_ia32_storeups(__addr_hi, __v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
  __m128d __v128;

  __v128 = _mm256_castpd256_pd128(__a);
  __builtin_ia32_storeupd(__addr_lo, __v128);
  __v128 = _mm256_extractf128_pd(__a, 1);
  __builtin_ia32_storeupd(__addr_hi, __v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
{
  __m128i __v128;

  __v128 = _mm256_castsi256_si128(__a);
  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
  __v128 = _mm256_extractf128_si256(__a, 1);
  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
}

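/*
 * Editor's example (illustrative, not part of the original header):
 * loadu2/storeu2 move two independent, possibly unaligned 128-bit halves,
 * with the low half coming from the second argument. A sketch, assuming
 * <immintrin.h> and -mavx, where row0 and row1 are hypothetical pointers
 * to two rows of floats:
 *
 *   __m256 v = _mm256_loadu2_m128(row1, row0);  // low half = row0[0..3]
 */
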
#endif /* __AVXINTRIN_H */