#include <immintrin.h>

#if defined(_MSC_VER) && defined(__clang__)
// Including <immintrin.h> should be enough, but Clang's headers skip these
// sub-headers when _MSC_VER is defined (e.g. under clang-cl), so include
// them directly.
#include <smmintrin.h>

#include <avxintrin.h>
#include <avx2intrin.h>
#include <f16cintrin.h>
#include <fmaintrin.h>

#include <avx512fintrin.h>
#include <avx512vlintrin.h>
#include <avx512bwintrin.h>
#include <avx512dqintrin.h>
#include <avx512vlbwintrin.h>
#include <avx512vldqintrin.h>
#include <avx512bitalgintrin.h>
#include <avx512vlbitalgintrin.h>
#include <avx512vpopcntdqintrin.h>
#include <avx512vpopcntdqvlintrin.h>
#endif
namespace detail {

template <typename T>
struct Raw512 {
  using type = __m512i;
};
template <>
struct Raw512<float> {
  using type = __m512;
};
template <>
struct Raw512<double> {
  using type = __m512d;
};

// Mask register: one bit per lane.
template <size_t size>
struct RawMask512 {};
template <>
struct RawMask512<1> {
  using type = __mmask64;
};
template <>
struct RawMask512<2> {
  using type = __mmask32;
};
template <>
struct RawMask512<4> {
  using type = __mmask16;
};
template <>
struct RawMask512<8> {
  using type = __mmask8;
};

}  // namespace detail
template <typename T>
struct Vec512 {
  using Raw = typename detail::Raw512<T>::type;

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload.
  HWY_INLINE Vec512& operator*=(const Vec512 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec512& operator/=(const Vec512 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec512& operator+=(const Vec512 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec512& operator-=(const Vec512 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec512& operator&=(const Vec512 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec512& operator|=(const Vec512 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec512& operator^=(const Vec512 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
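// Usage sketch (added illustration, not part of the header; function name is
// hypothetical): the compound operators simply forward to the binary
// operators defined further below.
HWY_INLINE Vec512<float> CompoundAssignExample(Vec512<float> acc,
                                               Vec512<float> x) {
  acc *= x;  // expands to acc = acc * x
  acc += x;  // expands to acc = acc + x
  return acc;
}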
template <typename T>
struct Mask512 {
  typename detail::RawMask512<sizeof(T)>::type raw;
};

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
HWY_INLINE __m512i BitCastToInteger(__m512 v) {
  return _mm512_castps_si512(v);
}
HWY_INLINE __m512i BitCastToInteger(__m512d v) {
  return _mm512_castpd_si512(v);
}
template <typename T>
HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
}

template <typename T>
struct BitCastFromInteger512 {
  HWY_INLINE __m512i operator()(__m512i v) { return v; }
};
template <>
struct BitCastFromInteger512<float> {
  HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
};
template <>
struct BitCastFromInteger512<double> {
  HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
};

template <typename T>
HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
  return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
}

}  // namespace detail

template <typename T, typename FromT>
HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Zero

template <typename T>
HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
  return Vec512<T>{_mm512_setzero_si512()};
}
// (float/double overloads elided)
// ------------------------------ Set
// (overloads for narrower lane types elided)

HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
  return Vec512<uint64_t>{
      _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
}
HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
  return Vec512<int64_t>{
      _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
}
// ------------------------------ Undefined

// Returns a vector with uninitialized elements.
template <typename T>
HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
  return Vec512<T>{_mm512_undefined_epi32()};
}
// ------------------------------ Not

template <typename T>
HWY_API Vec512<T> Not(const Vec512<T> v) {
  using TU = MakeUnsigned<T>;
  const __m512i vu = BitCast(Full512<TU>(), v).raw;
  return BitCast(Full512<T>(),
                 Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
}
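// Added explanation: _mm512_ternarylogic_epi32 evaluates an arbitrary
// three-input boolean function per bit; the imm8 is its truth table. 0x55
// (0b01010101) outputs 1 exactly when the first operand bit is 0, so passing
// vu for all three inputs computes ~vu in a single VPTERNLOG instruction,
// without materializing an all-ones constant for XOR.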
// ------------------------------ And

template <typename T>
HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
  return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T>
HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
  return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
}

// ------------------------------ Or

template <typename T>
HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
  return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
  return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
}

// (float/double overloads elided)

// ------------------------------ Operator overloads (internal-only)

template <typename T>
HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
  return And(a, b);
}
template <typename T>
HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
  return Or(a, b);
}
template <typename T>
HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
  return Xor(a, b);
}
// ------------------------------ PopulationCount

#if HWY_TARGET == HWY_AVX3_DL

// Per-target flag to prevent generic_ops-inl.h from defining PopulationCount.
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T>
HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
}
template <typename T>
HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
}
template <typename T>
HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
}
template <typename T>
HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
}

}  // namespace detail

template <typename T>
HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

#endif  // HWY_TARGET == HWY_AVX3_DL
// ------------------------------ CopySign

template <typename T>
HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

  const Full512<T> d;
  const auto msb = SignBit(d);

  const Rebind<MakeUnsigned<T>, decltype(d)> du;
  // Truth table for msb, magn, sign: 0xAC = bitwise msb ? sign : magn.
  // The lane size does not matter because we are not using predication.
  const __m512i out = _mm512_ternarylogic_epi32(
      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
  return BitCast(d, decltype(Zero(du)){out});
}
template <typename T>
HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
  // AVX3 can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}
// ------------------------------ FirstN

// Build a mask with the lowest n lanes set; BZHI (BMI2) clears all bits at
// and above index n.
namespace detail {

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_INLINE Mask512<T> FirstN(size_t n) {
  Mask512<T> m;
  m.raw = static_cast<decltype(m.raw)>(_bzhi_u32(~uint32_t(0), n));
  return m;
}

#if HWY_ARCH_X86_32
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask512<T> FirstN(size_t n) {
  // 32-bit builds lack _bzhi_u64.
  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
  return Mask512<T>{static_cast<__mmask64>(bits)};
}
#else
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask512<T> FirstN(size_t n) {
  Mask512<T> m;
  m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(~uint64_t(0), n));
  return m;
}
#endif  // HWY_ARCH_X86_32

}  // namespace detail

template <typename T>
HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
  return detail::FirstN<T>(n);
}
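// Usage sketch (added illustration; function name is hypothetical and this
// assumes a float overload of MaskedLoad, defined later in this header):
// FirstN masks the tail of an array whose length is not a lane-count multiple.
HWY_INLINE Vec512<float> LoadTail(Full512<float> d,
                                  const float* HWY_RESTRICT aligned,
                                  size_t remaining) {
  // Lanes [0, remaining) are loaded; the remaining lanes read as zero.
  return MaskedLoad(FirstN(d, remaining), d, aligned);
}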
// ------------------------------ Mask / IfThenElse
// (The IfThenElse, IfThenElseZero, IfThenZeroElse and ZeroIfNegative
// overloads, and the 16/32/64-bit shift operators, are elided here.)
// ------------------------------ Shift lanes by constant #bits (8-bit lanes)

// There are no 8-bit shift instructions; use the 16-bit shift and mask off
// the bits that crossed a lane boundary.
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
  const Full512<T> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
template <int kBits>
HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
  const Full512<uint8_t> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const auto shifted = BitCast(d8, ShiftRight<kBits>(BitCast(d16, v)));
  return shifted & Set(d8, 0xFF >> kBits);
}
template <int kBits>
HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
  const Full512<int8_t> di;
  const Full512<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
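// Added worked example of the sign-extension trick above, for kBits = 2 and
// lane value 0xF0 (= -16): the unsigned shift yields 0x3C, shifted_sign is
// 0x80 >> 2 = 0x20, and (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = 0xFC = -4,
// exactly the arithmetic-shift result. XOR-then-subtract re-creates the sign
// bits that the unsigned shift zeroed.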
// ------------------------------ Shift lanes by same variable #bits

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
  const Full512<T> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}
HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) {
  const Full512<uint8_t> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
}
HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
  const Full512<int8_t> di;
  const Full512<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;
}
// (Additional integer and floating-point arithmetic ops are elided here.)
// ------------------------------ Neg (Xor, Sub)

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec512<T> Neg(const Vec512<T> v) {
  return Xor(v, SignBit(Full512<T>()));
}
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec512<T> Neg(const Vec512<T> v) {
  return Zero(Full512<T>()) - v;
}
// ------------------------------ Floating-point rounding

// Toward nearest integer, tie to even
HWY_API Vec512<float> Round(const Vec512<float> v) {
  return Vec512<float>{_mm512_roundscale_ps(
      v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
}
HWY_API Vec512<double> Round(const Vec512<double> v) {
  return Vec512<double>{_mm512_roundscale_pd(
      v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
}

// Toward zero, aka truncate
HWY_API Vec512<float> Trunc(const Vec512<float> v) {
  return Vec512<float>{
      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
}
HWY_API Vec512<double> Trunc(const Vec512<double> v) {
  return Vec512<double>{
      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
}

// Toward +infinity, aka ceiling
HWY_API Vec512<float> Ceil(const Vec512<float> v) {
  return Vec512<float>{
      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
}
HWY_API Vec512<double> Ceil(const Vec512<double> v) {
  return Vec512<double>{
      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
}

// Toward -infinity, aka floor
HWY_API Vec512<float> Floor(const Vec512<float> v) {
  return Vec512<float>{
      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}
HWY_API Vec512<double> Floor(const Vec512<double> v) {
  return Vec512<double>{
      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}
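// Added note: the four ops differ only in the _MM_FROUND_* mode. For lanes
// {-2.5f, 2.5f, -2.3f}: Round gives {-2, 2, -2} (ties to even), Trunc gives
// {-2, 2, -2}, Ceil gives {-2, 3, -2}, and Floor gives {-3, 2, -3}.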
// ------------------------------ Mask

template <typename TFrom, typename TTo>
HWY_API Mask512<TTo> RebindMask(Full512<TTo> /*tag*/, Mask512<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask512<TTo>{m.raw};
}
namespace detail {

template <typename T>
HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec512<T> v,
                              const Vec512<T> bit) {
  return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
}
template <typename T>
HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec512<T> v,
                              const Vec512<T> bit) {
  return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
}
template <typename T>
HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec512<T> v,
                              const Vec512<T> bit) {
  return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
}
template <typename T>
HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec512<T> v,
                              const Vec512<T> bit) {
  return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
}

}  // namespace detail

template <typename T>
HWY_API Mask512<T> TestBit(const Vec512<T> v, const Vec512<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
}
// ------------------------------ Equality

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask512<T> operator==(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask512<T> operator==(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask512<T> operator==(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask512<T> operator==(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
}
// (float/double overloads elided)
// ------------------------------ Inequality

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask512<T> operator!=(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask512<T> operator!=(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask512<T> operator!=(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask512<T> operator!=(const Vec512<T> a, const Vec512<T> b) {
  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
}
// (float/double overloads elided)
// (Ordered comparisons (<, >, <=, >=) and mask<->vector conversions are
// elided here.)
// ------------------------------ Mask logical ops

namespace detail {

template <typename T>
HWY_INLINE Mask512<T> Not(hwy::SizeTag<1> /*tag*/, const Mask512<T> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_knot_mask64(m.raw)};
#else
  return Mask512<T>{~m.raw};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> Not(hwy::SizeTag<2> /*tag*/, const Mask512<T> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_knot_mask32(m.raw)};
#else
  return Mask512<T>{~m.raw};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> Not(hwy::SizeTag<4> /*tag*/, const Mask512<T> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_knot_mask16(m.raw)};
#else
  return Mask512<T>{static_cast<__mmask16>(~m.raw & 0xFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> Not(hwy::SizeTag<8> /*tag*/, const Mask512<T> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_knot_mask8(m.raw)};
#else
  return Mask512<T>{static_cast<__mmask8>(~m.raw & 0xFF)};
#endif
}

// And/AndNot/Or/Xor follow the same pattern; the SizeTag<1> (=__mmask64)
// overloads are shown, and the 32/16/8-bit mask variants (elided) use
// _kand_mask32 etc.
template <typename T>
HWY_INLINE Mask512<T> And(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
                          const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_kand_mask64(a.raw, b.raw)};
#else
  return Mask512<T>{a.raw & b.raw};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
                             const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
#else
  return Mask512<T>{~a.raw & b.raw};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> Or(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
                         const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_kor_mask64(a.raw, b.raw)};
#else
  return Mask512<T>{a.raw | b.raw};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
                          const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
#else
  return Mask512<T>{a.raw ^ b.raw};
#endif
}

}  // namespace detail
template <typename T>
HWY_API Mask512<T> Not(const Mask512<T> m) {
  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
}
template <typename T>
HWY_API Mask512<T> And(const Mask512<T> a, Mask512<T> b) {
  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T>
HWY_API Mask512<T> AndNot(const Mask512<T> a, Mask512<T> b) {
  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T>
HWY_API Mask512<T> Or(const Mask512<T> a, Mask512<T> b) {
  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T>
HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}

// ------------------------------ BroadcastSignBit (ShiftRight)

HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) {
  return ShiftRight<15>(v);
}
HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) {
  return ShiftRight<31>(v);
}
// (int8/int64 overloads elided)
// ------------------------------ Load

template <typename T>
HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec512<T>{_mm512_load_si512(aligned)};
}
// (float/double overloads elided)

template <typename T>
HWY_API Vec512<T> LoadU(Full512<T> /* tag */, const T* HWY_RESTRICT p) {
  return Vec512<T>{_mm512_loadu_si512(p)};
}
// ------------------------------ MaskedLoad

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
                             const T* HWY_RESTRICT aligned) {
  return Vec512<T>{_mm512_maskz_load_epi32(m.raw, aligned)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
                             const T* HWY_RESTRICT aligned) {
  return Vec512<T>{_mm512_maskz_load_epi64(m.raw, aligned)};
}

// (float/double overloads elided)

// There is no maskz_load_epi8/16; use the unaligned form instead.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
                             const T* HWY_RESTRICT aligned) {
  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, aligned)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
                             const T* HWY_RESTRICT aligned) {
  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, aligned)};
}
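// Usage sketch (added illustration; function name is hypothetical, and this
// assumes the float MaskedLoad overload plus the reductions defined later):
// summing an array whose length is not a multiple of 16 f32 lanes.
HWY_INLINE float SumArray(const float* HWY_RESTRICT aligned, size_t count) {
  const Full512<float> d;
  constexpr size_t N = 16;  // f32 lanes per 512-bit vector
  Vec512<float> sum = Zero(d);
  size_t i = 0;
  for (; i + N <= count; i += N) sum += Load(d, aligned + i);
  if (i != count) {
    // Remaining lanes read as zero, so they do not affect the sum.
    sum += MaskedLoad(FirstN(d, count - i), d, aligned + i);
  }
  return GetLane(SumOfLanes(d, sum));
}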
// ------------------------------ LoadDup128

// Loads 128 bits and duplicates them into all four 128-bit blocks. Clang
// generates a slower VINSERTF128 for the intrinsic form, hence the optional
// inline assembly.
template <typename T>
HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
                             const T* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512i out;
  asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  return Vec512<T>{out};
#else
  const auto x4 = LoadU(Full128<T>(), p);
  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
#endif
}
HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
                                 const float* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512 out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  return Vec512<float>{out};
#else
  const __m128 x4 = _mm_loadu_ps(p);
  return Vec512<float>{_mm512_broadcast_f32x4(x4)};
#endif
}
HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
                                  const double* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512d out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  return Vec512<double>{out};
#else
  const __m128d x2 = _mm_loadu_pd(p);
  return Vec512<double>{_mm512_broadcast_f64x2(x2)};
#endif
}
// ------------------------------ Store

template <typename T>
HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
                   T* HWY_RESTRICT aligned) {
  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
}
HWY_API void Store(const Vec512<float> v, Full512<float> /* tag */,
                   float* HWY_RESTRICT aligned) {
  _mm512_store_ps(aligned, v.raw);
}
HWY_API void Store(const Vec512<double> v, Full512<double> /* tag */,
                   double* HWY_RESTRICT aligned) {
  _mm512_store_pd(aligned, v.raw);
}

template <typename T>
HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
                    T* HWY_RESTRICT p) {
  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
}
HWY_API void StoreU(const Vec512<float> v, Full512<float> /* tag */,
                    float* HWY_RESTRICT p) {
  _mm512_storeu_ps(p, v.raw);
}
HWY_API void StoreU(const Vec512<double> v, Full512<double> /* tag */,
                    double* HWY_RESTRICT p) {
  _mm512_storeu_pd(p, v.raw);
}

// ------------------------------ Non-temporal stores

template <typename T>
HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
                    T* HWY_RESTRICT aligned) {
  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
}
HWY_API void Stream(const Vec512<float> v, Full512<float> /* tag */,
                    float* HWY_RESTRICT aligned) {
  _mm512_stream_ps(aligned, v.raw);
}
HWY_API void Stream(const Vec512<double> v, Full512<double> /* tag */,
                    double* HWY_RESTRICT aligned) {
  _mm512_stream_pd(aligned, v.raw);
}
// ------------------------------ Scatter

namespace detail {

template <typename T>
HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v,
                              Full512<T> /* tag */, T* HWY_RESTRICT base,
                              const Vec512<int32_t> offset) {
  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
}
template <typename T>
HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v,
                             Full512<T> /* tag */, T* HWY_RESTRICT base,
                             const Vec512<int32_t> index) {
  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
}

template <typename T>
HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v,
                              Full512<T> /* tag */, T* HWY_RESTRICT base,
                              const Vec512<int64_t> offset) {
  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
}
template <typename T>
HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v,
                             Full512<T> /* tag */, T* HWY_RESTRICT base,
                             const Vec512<int64_t> index) {
  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
}

}  // namespace detail

template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
                           const Vec512<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
}
template <typename T, typename Index>
HWY_API void ScatterIndex(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
                          const Vec512<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
}

HWY_API void ScatterOffset(Vec512<float> v, Full512<float> /* tag */,
                           float* HWY_RESTRICT base,
                           const Vec512<int32_t> offset) {
  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
}
HWY_API void ScatterIndex(Vec512<float> v, Full512<float> /* tag */,
                          float* HWY_RESTRICT base,
                          const Vec512<int32_t> index) {
  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
}
HWY_API void ScatterOffset(Vec512<double> v, Full512<double> /* tag */,
                           double* HWY_RESTRICT base,
                           const Vec512<int64_t> offset) {
  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
}
HWY_API void ScatterIndex(Vec512<double> v, Full512<double> /* tag */,
                          double* HWY_RESTRICT base,
                          const Vec512<int64_t> index) {
  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
}
// ------------------------------ Gather

namespace detail {

template <typename T>
HWY_INLINE Vec512<T> GatherOffset(hwy::SizeTag<4> /* tag */,
                                  Full512<T> /* tag */,
                                  const T* HWY_RESTRICT base,
                                  const Vec512<int32_t> offset) {
  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
}
template <typename T>
HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<4> /* tag */,
                                 Full512<T> /* tag */,
                                 const T* HWY_RESTRICT base,
                                 const Vec512<int32_t> index) {
  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
}

template <typename T>
HWY_INLINE Vec512<T> GatherOffset(hwy::SizeTag<8> /* tag */,
                                  Full512<T> /* tag */,
                                  const T* HWY_RESTRICT base,
                                  const Vec512<int64_t> offset) {
  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
}
template <typename T>
HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */,
                                 Full512<T> /* tag */,
                                 const T* HWY_RESTRICT base,
                                 const Vec512<int64_t> index) {
  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
}

}  // namespace detail

template <typename T, typename Offset>
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
                               const Vec512<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}
template <typename T, typename Index>
HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
                              const Vec512<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
}
// ------------------------------ Extract half

template <typename T>
HWY_API Vec256<T> LowerHalf(Full256<T> /* tag */, Vec512<T> v) {
  return Vec256<T>{_mm512_castsi512_si256(v.raw)};
}
// (float/double overloads elided)

template <typename T>
HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
  return LowerHalf(Full256<T>(), v);
}

template <typename T>
HWY_API Vec256<T> UpperHalf(Full256<T> /* tag */, Vec512<T> v) {
  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
}

// ------------------------------ GetLane (LowerHalf)

template <typename T>
HWY_API T GetLane(const Vec512<T> v) {
  return GetLane(LowerHalf(v));
}

// ------------------------------ ZeroExtendVector

template <typename T>
HWY_API Vec512<T> ZeroExtendVector(Full512<T> /* tag */, Vec256<T> lo) {
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
  // GCC 9 lacks the zero-extension intrinsics; this insert also zero-extends.
  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
#else
  return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
#endif
}
HWY_API Vec512<float> ZeroExtendVector(Full512<float> /* tag */,
                                       Vec256<float> lo) {
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
#else
  return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
#endif
}
HWY_API Vec512<double> ZeroExtendVector(Full512<double> /* tag */,
                                        Vec256<double> lo) {
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
#else
  return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
#endif
}

// ------------------------------ Combine

template <typename T>
HWY_API Vec512<T> Combine(Full512<T> d, Vec256<T> hi, Vec256<T> lo) {
  const auto lo512 = ZeroExtendVector(d, lo);
  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
}
// (float/double overloads elided)
// ------------------------------ ShiftLeftBytes

template <int kBytes, typename T>
HWY_API Vec512<T> ShiftLeftBytes(Full512<T> /* tag */, const Vec512<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
}

template <int kBytes, typename T>
HWY_API Vec512<T> ShiftLeftBytes(const Vec512<T> v) {
  return ShiftLeftBytes<kBytes>(Full512<T>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T>
HWY_API Vec512<T> ShiftLeftLanes(Full512<T> d, const Vec512<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T>
HWY_API Vec512<T> ShiftLeftLanes(const Vec512<T> v) {
  return ShiftLeftLanes<kLanes>(Full512<T>(), v);
}

// ------------------------------ ShiftRightBytes

template <int kBytes, typename T>
HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
}

// ------------------------------ ShiftRightLanes

template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, class V = Vec512<T>>
HWY_API V CombineShiftRightBytes(Full512<T> d, V hi, V lo) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
}
// ------------------------------ Broadcast/splat any lane

// Unsigned
template <int kLane>
HWY_API Vec512<uint16_t> Broadcast(const Vec512<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  if (kLane < 4) {
    const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
  } else {
    const __m512i hi =
        _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
  }
}
template <int kLane>
HWY_API Vec512<uint32_t> Broadcast(const Vec512<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
}
template <int kLane>
HWY_API Vec512<uint64_t> Broadcast(const Vec512<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
}

// Signed
template <int kLane>
HWY_API Vec512<int16_t> Broadcast(const Vec512<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  if (kLane < 4) {
    const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
    return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
  } else {
    const __m512i hi =
        _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
    return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
  }
}
template <int kLane>
HWY_API Vec512<int32_t> Broadcast(const Vec512<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
}
template <int kLane>
HWY_API Vec512<int64_t> Broadcast(const Vec512<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
}

// Float
template <int kLane>
HWY_API Vec512<float> Broadcast(const Vec512<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
}
template <int kLane>
HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
}
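// Added note: Broadcast splats within each 128-bit block rather than across
// the whole vector: for u32 lanes {0,1,2,3, 4,5,6,7, ...}, Broadcast<1>()
// yields {1,1,1,1, 5,5,5,5, ...}. Use TableLookupLanes (below) to splat one
// lane across all 512 bits.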
// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices512 {
  __m512i raw;
};

template <typename T>
HWY_API Indices512<T> SetTableIndices(const Full512<T>, const int32_t* idx) {
#if HWY_IS_DEBUG_BUILD
  const size_t N = 64 / sizeof(T);
  for (size_t i = 0; i < N; ++i) {
    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
  }
#endif
  return Indices512<T>{LoadU(Full512<int32_t>(), idx).raw};
}

template <typename T>
HWY_API Vec512<T> TableLookupLanes(const Vec512<T> v,
                                   const Indices512<T> idx) {
  return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
}

// ------------------------------ Reverse

template <typename T>
HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
  alignas(32) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                7,  6,  5,  4,  3,  2,  1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}
// ------------------------------ InterleaveLower/InterleaveUpper

// (per-type overloads elided; these tag-taking forms forward to them)
template <typename T, class V = Vec512<T>>
HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}
template <typename T, class V = Vec512<T>>
HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
  return InterleaveUpper(a, b);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width
// integers; this is necessary because the single-lane scalar cannot return
// two values.
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipLower(Full512<TW> /* tag */, Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipUpper(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
  return BitCast(d, InterleaveUpper(Full512<T>(), a, b));
}

// ------------------------------ Concat* halves

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec512<T> ConcatLowerLower(Full512<T> /* tag */, const Vec512<T> hi,
                                   const Vec512<T> lo) {
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
}
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <typename T>
HWY_API Vec512<T> ConcatUpperUpper(Full512<T> /* tag */, const Vec512<T> hi,
                                   const Vec512<T> lo) {
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
}
// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
template <typename T>
HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi,
                                   const Vec512<T> lo) {
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
}

// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <typename T>
HWY_API Vec512<T> ConcatUpperLower(Full512<T> /* tag */, const Vec512<T> hi,
                                   const Vec512<T> lo) {
  // There are no imm8 blends in AVX512; use a mask register instead.
  const __mmask32 mask = (0x0000FFFF);
  return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
}
HWY_API Vec512<float> ConcatUpperLower(Full512<float> /* tag */,
                                       const Vec512<float> hi,
                                       const Vec512<float> lo) {
  const __mmask16 mask = (0x00FF);
  return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
}
HWY_API Vec512<double> ConcatUpperLower(Full512<double> /* tag */,
                                        const Vec512<double> hi,
                                        const Vec512<double> lo) {
  const __mmask8 mask = (0x0F);
  return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
}
// ------------------------------ ConcatOdd

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
}

HWY_API Vec512<float> ConcatOdd(Full512<float> d, Vec512<float> hi,
                                Vec512<float> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  return Vec512<float>{_mm512_mask2_permutex2var_ps(
      lo.raw, Load(du, kIdx).raw, __mmask16{0xFFFF}, hi.raw)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask8{0xFF}, BitCast(du, hi).raw)});
}

HWY_API Vec512<double> ConcatOdd(Full512<double> d, Vec512<double> hi,
                                 Vec512<double> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return Vec512<double>{_mm512_mask2_permutex2var_pd(
      lo.raw, Load(du, kIdx).raw, __mmask8{0xFF}, hi.raw)};
}
// ------------------------------ ConcatEven

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
}

HWY_API Vec512<float> ConcatEven(Full512<float> d, Vec512<float> hi,
                                 Vec512<float> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  return Vec512<float>{_mm512_mask2_permutex2var_ps(
      lo.raw, Load(du, kIdx).raw, __mmask16{0xFFFF}, hi.raw)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask8{0xFF}, BitCast(du, hi).raw)});
}

HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi,
                                  Vec512<double> lo) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return Vec512<double>{_mm512_mask2_permutex2var_pd(
      lo.raw, Load(du, kIdx).raw, __mmask8{0xFF}, hi.raw)};
}
// ------------------------------ OddEven

template <typename T>
HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
  constexpr size_t s = sizeof(T);
  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
}
// ------------------------------ TableLookupBytes

template <typename T, typename TI>
HWY_API Vec512<TI> TableLookupBytes(const Vec512<T> bytes,
                                    const Vec512<TI> from) {
  return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, from.raw)};
}

// Partial index vector: expand to 512 bits, look up, then shrink again.
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec512<T> bytes, Vec128<TI, NI> from) {
  const Full512<TI> d512;
  const Half<decltype(d512)> d256;
  const Half<decltype(d256)> d128;
  // First expand to full 128, then 256, then 512.
  const Vec128<TI> from_full{from.raw};
  const auto from_512 =
      ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
  const auto tbl_full = TableLookupBytes(bytes, from_512);
  // Shrink to 256, then 128, then partial.
  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
}
// (the Vec256-index overload is analogous and elided)

// Partial table vector: expand to 512 bits before the lookup.
template <typename T, size_t N, typename TI>
HWY_API Vec512<TI> TableLookupBytes(Vec128<T, N> bytes, Vec512<TI> from) {
  const Full512<T> d512;
  const Half<decltype(d512)> d256;
  const Vec128<T> bytes_full{bytes.raw};
  const auto bytes_512 =
      ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
  return TableLookupBytes(bytes_512, from);
}
// (the Vec256-table overload is analogous and elided)
// ------------------------------ Promotions (bfloat16 -> float)

HWY_API Vec512<float> PromoteTo(Full512<float> df32,
                                const Vec256<bfloat16_t> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
// ------------------------------ Demotions (full -> part w/ narrow lanes)

HWY_API Vec256<uint16_t> DemoteTo(Full256<uint16_t> /* tag */,
                                  const Vec512<int32_t> v) {
  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};

  // Compress even u64 lanes into 256 bit.
  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
  const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
  return LowerHalf(even);
}

HWY_API Vec256<int16_t> DemoteTo(Full256<int16_t> /* tag */,
                                 const Vec512<int32_t> v) {
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};

  // Compress even u64 lanes into 256 bit.
  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
  const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
  return LowerHalf(even);
}
HWY_API Vec128<uint8_t, 16> DemoteTo(Full128<uint8_t> /* tag */,
                                     const Vec512<int32_t> v) {
  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
  // packus treats the intermediate as signed; clear the MSB so the second
  // packus saturates correctly.
  const Vec512<int16_t> i16{
      _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};

  alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
  return LowerHalf(LowerHalf(fixed));
}
HWY_API Vec256<uint8_t> DemoteTo(Full256<uint8_t> /* tag */,
                                 const Vec512<int16_t> v) {
  const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};

  // Compress even u64 lanes into 256 bit.
  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
  const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
  return LowerHalf(even);
}
HWY_API Vec128<int8_t, 16> DemoteTo(Full128<int8_t> /* tag */,
                                    const Vec512<int32_t> v) {
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};

  alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
                                                      0, 4, 8, 12, 0, 4, 8, 12};
  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
  return LowerHalf(LowerHalf(fixed));
}
HWY_API Vec256<int8_t> DemoteTo(Full256<int8_t> /* tag */,
                                const Vec512<int16_t> v) {
  const Vec512<int8_t> i8{_mm512_packs_epi16(v.raw, v.raw)};

  // Compress even u64 lanes into 256 bit.
  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
  const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, i8.raw)};
  return LowerHalf(even);
}
HWY_API Vec256<bfloat16_t> DemoteTo(Full256<bfloat16_t> dbf16,
                                    const Vec512<float> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}

HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
                                            Vec512<float> a, Vec512<float> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec128<uint8_t, 16> U8FromU32(const Vec512<uint32_t> v) {
  const Full512<uint32_t> d32;
  // In each 128-bit block, gather the lower byte of 4 uint32_t lanes into the
  // lowest 4 bytes.
  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
                                                       ~0u};
  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
  // Gather the lowest 4 u32 of every 128-bit block into the lowest 512 bits.
  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
  const Vec512<uint8_t> bytes{
      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
  return LowerHalf(LowerHalf(bytes));
}
// ------------------------------ Crypto

#if !defined(HWY_DISABLE_PCLMUL_AES)

// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif
HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
                                 Vec512<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
#else
  // Fall back on the 128-bit AESRound, one block at a time.
  const Full512<uint8_t> d;
  const Full128<uint8_t> d128;
  alignas(64) uint8_t a[64];
  alignas(64) uint8_t b[64];
  Store(state, d, a);
  Store(round_key, d, b);
  for (size_t i = 0; i < 64; i += 16) {
    const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i));
    Store(enc, d128, a + i);
  }
  return Load(d, a);
#endif
}
HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
#else
  const Full512<uint64_t> d;
  const Full128<uint64_t> d128;
  alignas(64) uint64_t a[8];
  alignas(64) uint64_t b[8];
  Store(va, d, a);
  Store(vb, d, b);
  for (size_t i = 0; i < 8; i += 2) {
    const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
    Store(mul, d128, a + i);
  }
  return Load(d, a);
#endif
}
HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
#else
  const Full512<uint64_t> d;
  const Full128<uint64_t> d128;
  alignas(64) uint64_t a[8];
  alignas(64) uint64_t b[8];
  Store(va, d, a);
  Store(vb, d, b);
  for (size_t i = 0; i < 8; i += 2) {
    const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
    Store(mul, d128, a + i);
  }
  return Load(d, a);
#endif
}

#endif  // HWY_DISABLE_PCLMUL_AES
// ------------------------------ Iota

template <typename T, typename T2>
HWY_API Vec512<T> Iota(const Full512<T> d, const T2 first) {
  HWY_ALIGN T lanes[64 / sizeof(T)];
  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
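// Usage sketch (added illustration; function name is hypothetical): Iota
// generates ascending lane indices, e.g. for comparing against loop bounds.
HWY_INLINE Vec512<int32_t> LaneIndices() {
  const Full512<int32_t> d;
  return Iota(d, 0);  // {0, 1, 2, ..., 15}
}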
// ------------------------------ Mask testing

namespace detail {

template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask64_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}

}  // namespace detail
template <typename T>
HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
}
namespace detail {

template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask64_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFull;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFull;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFull;
#endif
}

}  // namespace detail
template <typename T>
HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
}

// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T>
HWY_API Mask512<T> LoadMaskBits(const Full512<T> /* tag */,
                                const uint8_t* HWY_RESTRICT bits) {
  Mask512<T> mask;
  CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
  return mask;
}

// `bits` points to at least 8 writable bytes.
template <typename T>
HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
                             uint8_t* bits) {
  const size_t kNumBytes = 8 / sizeof(T);
  CopyBytes<kNumBytes>(&mask.raw, bits);
  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
  return kNumBytes;
}
template <typename T>
HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
  return PopCount(static_cast<uint64_t>(mask.raw));
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
                               const Mask512<T> mask) {
  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
                               const Mask512<T> mask) {
  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
}
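// Usage sketch (added illustration; function name is hypothetical): locating
// a value within a vector via the mask ops above.
HWY_INLINE intptr_t FindValue(Vec512<int32_t> v, int32_t needle) {
  const Full512<int32_t> d;
  return FindFirstTrue(d, v == Set(d, needle));  // lane index, or -1
}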
// ------------------------------ Compress

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
  return Vec512<T>{_mm512_maskz_compress_epi64(mask.raw, v.raw)};
}
// (float/double overloads elided)
// 16-bit lanes may use the 32-bit Compress, so this must be defined after it.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
  const Full256<T> d;
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
#else
  // Promote to i32 (a 512-bit vector) so we can use the native Compress.
  const auto promoted = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
  const auto compressed = Compress(promoted, mask32);
  // Demote back to u16 (values were zero-extended, so they fit).
  const Vec256<uint16_t> cu = DemoteTo(du, compressed);
#endif  // HWY_TARGET == HWY_AVX3_DL

  return BitCast(d, cu);
}
// Expands to 32-bit, compresses, then concatenates the demoted halves.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
  const Full512<T> d;
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
#else
  const Repartition<int32_t, decltype(d)> dw;
  const Half<decltype(du)> duh;
  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));

  const uint32_t mask_bits{mask.raw};
  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));

  // Concatenate: shift demoted1 up by the number of lanes demoted0 kept.
  const size_t num0 = CountTrue(dw, mask0);
  const __mmask32 m_upper = ~((1u << num0) - 1);
  alignas(64) uint16_t iota[64] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
  const auto idx = LoadU(du, iota + 32 - num0);
  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
      demoted0.raw, m_upper, idx.raw, demoted1.raw)};
#endif  // HWY_TARGET == HWY_AVX3_DL

  return BitCast(d, cu);
}
// ------------------------------ CompressBits

template <typename T>
HWY_API Vec512<T> CompressBits(Vec512<T> v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(Full512<T>(), bits));
}
// ------------------------------ CompressStore

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask, Full512<T> d,
                             T* HWY_RESTRICT unaligned) {
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);  // (required for float16_t inputs)

  const uint64_t mask_bits{mask.raw};

#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
#else
  const Repartition<int32_t, decltype(d)> dw;
  const Half<decltype(du)> duh;
  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));

  const uint64_t maskL = mask_bits & 0xFFFF;
  const uint64_t maskH = mask_bits >> 16;
  const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
  const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  const Half<decltype(d)> dh;
  const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
  const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));

  // Store 256-bit halves: the second starts after however many lanes the
  // first actually kept.
  StoreU(demoted0, dh, unaligned);
  StoreU(demoted1, dh, unaligned + PopCount(maskL));
#endif  // HWY_TARGET == HWY_AVX3_DL

  return PopCount(mask_bits);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask,
                             Full512<T> /* tag */, T* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw});
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask,
                             Full512<T> /* tag */, T* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw});
}

HWY_API size_t CompressStore(Vec512<float> v, const Mask512<float> mask,
                             Full512<float> /* tag */,
                             float* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw});
}

HWY_API size_t CompressStore(Vec512<double> v, const Mask512<double> mask,
                             Full512<double> /* tag */,
                             double* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  return PopCount(uint64_t{mask.raw});
}
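// Usage sketch (added illustration; function name is hypothetical, and this
// assumes the float operator> from the elided comparison section):
// stream-filtering. The return value is the number of lanes written, used to
// advance the output pointer.
HWY_INLINE size_t AppendPositive(Vec512<float> v, float* HWY_RESTRICT out) {
  const Full512<float> d;
  return CompressStore(v, v > Zero(d), d, out);
}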
// ------------------------------ CompressBitsStore

template <typename T>
HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Full512<T> d, T* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

HWY_API void StoreInterleaved3(const Vec512<uint8_t> v0,
                               const Vec512<uint8_t> v1,
                               const Vec512<uint8_t> v2, Full512<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  const auto k5 = Set(d, 5);
  const auto k6 = Set(d, 6);

  // Shuffle (v0,v1,v2) vector bytes to bgr[5:0]; 0x80 means the lane is
  // filled from another vector when blending via Or.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = LoadDup128(d, tbl_r0);
  const auto shuf_g0 = LoadDup128(d, tbl_g0);  // cannot reuse r0 due to 5
  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(v0, shuf_r0);
  const auto g0 = TableLookupBytes(v1, shuf_g0);
  const auto b0 = TableLookupBytes(v2, shuf_b0);
  const auto i = (r0 | g0 | b0).raw;

  // Second block: g10,r10, bgr[9:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;
  const auto shuf_g1 = shuf_r0 + k5;
  const auto shuf_b1 = shuf_g0 + k5;
  const auto r1 = TableLookupBytes(v0, shuf_r1);
  const auto g1 = TableLookupBytes(v1, shuf_g1);
  const auto b1 = TableLookupBytes(v2, shuf_b1);
  const auto j = (r1 | g1 | b1).raw;

  // Third block: bgr[15:11], b10
  const auto shuf_r2 = shuf_b1 + k6;
  const auto shuf_g2 = shuf_r1 + k5;
  const auto shuf_b2 = shuf_g1 + k5;
  const auto r2 = TableLookupBytes(v0, shuf_r2);
  const auto g2 = TableLookupBytes(v1, shuf_g2);
  const auto b2 = TableLookupBytes(v2, shuf_b2);
  const auto k = (r2 | g2 | b2).raw;

  // 128-bit blocks were independent until now; transpose them into order.
  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));

  // Alternating order; the most-significant 128 bits come from the second arg.
  const __mmask8 m = 0xCC;
  const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
  const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
  const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);

  StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64);
  StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64);
  StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64);
}
// ------------------------------ StoreInterleaved4

HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
                               const Vec512<uint8_t> v1,
                               const Vec512<uint8_t> v2,
                               const Vec512<uint8_t> v3, Full512<uint8_t> d8,
                               uint8_t* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);  // d7 c7 .. d0 c0
  const auto ba8 = ZipUpper(d16, v0, v1);
  const auto dc8 = ZipUpper(d16, v2, v3);
  const auto i = ZipLower(d32, ba0, dc0).raw;  // 4x128bit: d..a3 d..a0
  const auto j = ZipUpper(d32, ba0, dc0).raw;  // 4x128bit: d..a7 d..a4
  const auto k = ZipLower(d32, ba8, dc8).raw;  // 4x128bit: d..aB d..a8
  const auto l = ZipUpper(d32, ba8, dc8).raw;  // 4x128bit: d..aF d..aC
  // 128-bit blocks were independent until now; transpose 4x4.
  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
  constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
  constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
  const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
  StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d8, unaligned + 0 * 64);
  StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d8, unaligned + 1 * 64);
  StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d8, unaligned + 2 * 64);
  StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d8, unaligned + 3 * 64);
}
// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)

HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
                                    const Vec512<uint64_t> b) {
  const DFromV<decltype(a)> du64;
  const RepartitionToNarrow<decltype(du64)> du32;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  // Move the upper 32 bits of each 64-bit lane into the lower position.
  const auto aH = Shuffle2301(a32);
  const auto bH = Shuffle2301(b32);

  // Knuth double-word multiplication, built from 32x32 = 64-bit MulEven;
  // we only need the even (lower 64 bits per 128-bit block) results.
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;

  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);

  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);

  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;
  return InterleaveLower(mulL, mulH);
}
HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
                                   const Vec512<uint64_t> b) {
  const DFromV<decltype(a)> du64;
  const RepartitionToNarrow<decltype(du64)> du32;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aH = Shuffle2301(a32);
  const auto bH = Shuffle2301(b32);

  // Same decomposition as MulEven; only the final interleave differs because
  // we want the odd (upper 64 bits per 128-bit block) results.
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;

  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);

  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);

  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;
  return InterleaveUpper(du64, mulL, mulH);
}
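// Added worked example of the double-word decomposition, using 8-bit "words"
// for brevity: a = 0x1234, b = 0x5678, so (aH,aL) = (0x12,0x34) and
// (bH,bL) = (0x56,0x78).
//   aLbL = 0x34*0x78              = 0x1860 -> w3 = 0x60
//   t2   = 0x12*0x78 + (aLbL>>8)  = 0x0870 + 0x18 = 0x0888 -> w2=0x88, w1=0x08
//   t    = 0x34*0x56 + w2         = 0x1178 + 0x88 = 0x1200 -> k = 0x12
//   mulH = 0x12*0x56 + w1 + k     = 0x060C + 0x08 + 0x12 = 0x0626
//   mulL = (t << 8) + w3          = 0x0060 (mod 2^16)
// Check: 0x1234 * 0x5678 = 0x06260060 = (mulH << 16) | mulL.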
// ------------------------------ Reductions

// Returns the sum in each lane.
HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
  return Set(d, _mm512_reduce_add_epi32(v.raw));
}
HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
  return Set(d, _mm512_reduce_add_epi64(v.raw));
}
HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
  return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
}
HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
  return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
}
HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
  return Set(d, _mm512_reduce_add_ps(v.raw));
}
HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
  return Set(d, _mm512_reduce_add_pd(v.raw));
}

// Returns the minimum in each lane.
HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
  return Set(d, _mm512_reduce_min_epi32(v.raw));
}
HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
  return Set(d, _mm512_reduce_min_epi64(v.raw));
}
HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
  return Set(d, _mm512_reduce_min_epu32(v.raw));
}
HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
  return Set(d, _mm512_reduce_min_epu64(v.raw));
}
HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
  return Set(d, _mm512_reduce_min_ps(v.raw));
}
HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
  return Set(d, _mm512_reduce_min_pd(v.raw));
}

// Returns the maximum in each lane.
HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
  return Set(d, _mm512_reduce_max_epi32(v.raw));
}
HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
  return Set(d, _mm512_reduce_max_epi64(v.raw));
}
HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
  return Set(d, _mm512_reduce_max_epu32(v.raw));
}
HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
  return Set(d, _mm512_reduce_max_epu64(v.raw));
}
HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
  return Set(d, _mm512_reduce_max_ps(v.raw));
}
HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
  return Set(d, _mm512_reduce_max_pd(v.raw));
}
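// Usage sketch (added illustration; function name is hypothetical):
// reductions broadcast their result to all lanes; extract it with GetLane.
HWY_INLINE float HorizontalSum(Vec512<float> v) {
  const Full512<float> d;
  return GetLane(SumOfLanes(d, v));
}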
// ------------------------------ Deprecated wrappers without a Simd<> tag
// (tag-free forms of the mask and reduction ops above are elided here)
template <int kBytes, typename T>
HWY_API Vec512<T> ShiftRightBytes(const Vec512<T> v) {
  return ShiftRightBytes<kBytes>(Full512<T>(), v);
}
template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(const Vec512<T> v) {
  return ShiftRightLanes<kLanes>(Full512<T>(), v);
}
template <size_t kBytes, typename T>
HWY_API Vec512<T> CombineShiftRightBytes(Vec512<T> hi, Vec512<T> lo) {
  return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo);
}
// (remaining deprecated tag-free wrappers elided)