#include <wasm_simd128.h>
#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif  // HWY_WASM_OLD_NAMES
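// Example (a sketch, not part of the header): thanks to the aliases above,
// code written against the current wasm_simd128.h intrinsic names also
// compiles on older Emscripten toolchains that still ship the *_saturate /
// *_widen spellings.
static inline v128_t SaturatedAddU8(v128_t a, v128_t b) {
  return wasm_u8x16_add_sat(a, b);  // maps to wasm_u8x16_add_saturate if old
}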
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;

namespace detail {

// Raw vector type for each lane type; wasm uses one integer type for all
// integer lanes and a separate type for float.
template <typename T, size_t N = 16 / sizeof(T)>
struct Raw128 {
  using type = __v128_u;
};
template <size_t N>
struct Raw128<float, N> {
  using type = __f32x4;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  typename detail::Raw128<T>::type raw;
};
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

namespace detail {
struct DeduceD {
  template <typename T, size_t N>
  Simd<T, N> operator()(Vec128<T, N>) const { return Simd<T, N>(); }
};
}  // namespace detail

template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
// ------------------------------ BitCast

namespace detail {

HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
  return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}

template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
  return Vec128<float, N>{wasm_f32x4_splat(t)};
}

// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N> d) {
  return Zero(d);
}

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
  alignas(16) T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
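// Usage sketch (not part of the header): constructing and initializing
// vectors with Set/Iota, then storing the lanes. Assumes the arithmetic and
// Store ops defined elsewhere in this file.
static inline void IotaExample(int32_t* HWY_RESTRICT out) {
  const Simd<int32_t, 4> d;
  const auto v = Iota(d, 0) + Set(d, 10);  // lanes {10, 11, 12, 13}
  alignas(16) int32_t lanes[4];
  Store(v, d, lanes);
  for (size_t i = 0; i < 4; ++i) out[i] = lanes[i];
}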
// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
// 8-bit: no wasm instruction; emulate via the 16-bit shift plus masking.
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const Simd<T, N> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const Simd<uint8_t, N> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const Simd<int8_t, N> di;
  const Simd<uint8_t, N> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
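// Worked example (a sketch, not part of the header): the xor/subtract above
// sign-extends after an unsigned shift. The sign bit lands at position
// 7 - kBits; XOR with 0x80 >> kBits flips it, and the subtraction borrows
// through the upper bits. Shown here for kBits = 2:
static inline int8_t AsrBy2(int8_t x) {
  const uint8_t shifted = static_cast<uint8_t>(x) >> 2;  // logical shift
  const uint8_t sign = 0x80 >> 2;                        // moved sign bit
  return static_cast<int8_t>((shifted ^ sign) - sign);   // sign-extend
}
// AsrBy2(-128) == -32 and AsrBy2(127) == 31, matching an arithmetic shift.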
// ------------------------------ Shift lanes by same variable #bits

// 8-bit: no wasm instruction; emulate via the 16-bit shift plus masking.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const Simd<T, N> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, (0xFF << bits) & 0xFF);
}

template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const Simd<uint8_t, N> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const Simd<int8_t, N> di;
  const Simd<uint8_t, N> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
// 64-bit Min/Max: wasm SIMD has no 64-bit min/max, so extract both lanes and
// compare in scalar code.
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
  alignas(16) uint64_t min[2];
  min[0] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}

template <size_t N>
HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}

template <size_t N>
HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
  alignas(16) uint64_t max[2];
  max[0] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}

template <size_t N>
HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}
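// Usage sketch (not part of the header): clamp 64-bit lanes to an upper
// bound; this compiles to the scalar fallback above on wasm.
static inline Vec128<uint64_t> ClampU64(Vec128<uint64_t> v, uint64_t hi) {
  const Full128<uint64_t> d;
  return Min(v, Set(d, hi));
}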
// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // Move the upper 16 bits of each 32-bit product into the result lanes.
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
// Multiplies even lanes (0, 2, ...) and returns the double-width result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  // wasm_i64x2_mul consumes whole 64-bit inputs, so clear the odd lanes.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
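// Worked example (a sketch, not part of the header): for uint32 lanes
// a = {3, 7, 5, 9} and b = {2, 1, 4, 1}, MulEven returns the uint64 lanes
// {3 * 2, 5 * 4} = {6, 20}; the odd input lanes 7, 9, 1, 1 are ignored.
static inline void MulEvenExample() {
  const Full128<uint32_t> d;
  alignas(16) const uint32_t a_lanes[4] = {3, 7, 5, 9};
  alignas(16) const uint32_t b_lanes[4] = {2, 1, 4, 1};
  const auto prod = MulEven(Load(d, a_lanes), Load(d, b_lanes));
  (void)prod;  // holds {6, 20}
}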
// Returns the negated value; for floats, flips the sign bit.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(Simd<T, N>()));
}

// ------------------------------ Floating-point div

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal: wasm has no rcp instruction, so use full division.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// ------------------------------ Floating-point multiply-add variants
// Wasm SIMD lacks FMA; each of these is two operations.

// Returns mul * x + add.
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return mul * x + add;
}

// Returns add - mul * x.
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return add - mul * x;
}

// Returns mul * x - sub.
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> sub) {
  return mul * x - sub;
}

// Returns -mul * x - sub.
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root.
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root: no instruction, use 1 / Sqrt.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even.
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate.
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling.
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor.
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
// ------------------------------ Comparisons

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /* tag */,
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, N>{m.raw};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}
// 64-bit signed comparison, composed from two 32-bit comparisons.
template <size_t N>
HWY_API Mask128<int64_t, N> operator<(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves differ, they decide the result.
  const auto m_gt = a32 < b32;
  // Otherwise, equal upper halves defer to the lower-half comparison.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
  const auto lo_gt = And(m_eq, lo_in_hi);
  const auto gt = Or(lo_gt, m_gt);
  // Broadcast the per-64-bit result into both 32-bit halves.
  return Mask128<int64_t, N>{wasm_i32x4_shuffle(gt, gt, 1, 1, 3, 3)};
}
// ------------------------------ FirstN (Iota, Lt)

template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
// ------------------------------ Logical

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// Operator overloads forward to the named functions.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Simd<T, N>()), sign));
}
// ------------------------------ BroadcastSignBit (compare)

template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N> /* tag */, Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// DEPRECATED
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}
// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(Simd<T, N>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(Simd<T, N>(), mask), no);
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const Simd<T, N> d;
  const auto zero = Zero(d);
  return IfThenElse(v < zero, zero, v);
}
// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const Simd<T, N> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
// ------------------------------ Shl / Shr (variable per-lane counts)

// Wasm lacks per-lane variable shifts; emulate bit by bit. Each bit of the
// count (tested from highest to lowest) conditionally shifts by that bit's
// value; BroadcastSignBit turns the tested bit into a lane mask.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const Simd<T, N> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// The 32-bit variant tests five count bits, starting with ShiftLeft<27> and
// conditionally shifting v by 16, 8, 4, 2 and finally 1.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits);

// operator>> mirrors operator<<, shifting v right instead of left.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const Simd<T, N> d;
  Mask128<T, N> mask;
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// The 32-bit operator>> likewise starts with ShiftLeft<27> and conditionally
// shifts right by 16, 8, 4, 2, 1.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits);
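// Usage sketch (not part of the header): shifting each lane by its own count.
static inline Vec128<uint16_t> ShiftEachLane() {
  const Full128<uint16_t> d;
  const auto v = Set(d, 1);
  const auto counts = Iota(d, 0);  // lane i shifted by i
  return v << counts;             // lanes {1, 2, 4, 8, 16, 32, 64, 128}
}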
// ------------------------------ Load / Store

template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{wasm_v128_load(aligned)};
}

// On wasm, unaligned loads are the same instruction.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// Partial vectors: copy only the valid bytes.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(p, &v);
  return v;
}

// 128-bit SIMD => nothing to duplicate; same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* const HWY_RESTRICT p) {
  return Load(d, p);
}

template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */,
                   T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// Partial vectors: copy only the valid bytes.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) {
  CopyBytes<sizeof(T) * N>(&v, p);
}

HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
                   float* HWY_RESTRICT p) {
  *p = wasm_f32x4_extract_lane(v.raw, 0);
}

// Unaligned stores are also the same instruction.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

// Wasm has no non-temporal store; Stream is an ordinary aligned store.
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}
// ------------------------------ Scatter (emulated: store lanes, then copy)

template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Simd<Offset, N>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Simd<Index, N>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (emulated)

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Simd<Offset, N>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Simd<Index, N>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
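// Usage sketch (not part of the header): gather four floats from a lookup
// table at per-lane indices. On wasm this expands to the scalar loop above.
static inline Vec128<float> GatherExample(const float* HWY_RESTRICT table) {
  const Full128<float> df;
  const Full128<int32_t> di;
  alignas(16) const int32_t idx[4] = {7, 0, 3, 2};
  return GatherIndex(df, table, Load(di, idx));
}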
// ------------------------------ GetLane (returns lane 0)

template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return wasm_f32x4_extract_lane(v.raw, 0);
}
// ------------------------------ LowerHalf

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return LowerHalf(Simd<T, N / 2>(), v);
}
// ------------------------------ ShiftLeftBytes

// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;
    case 1:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
                                             6, 7, 8, 9, 10, 11, 12, 13, 14)};
    case 2:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
                                             5, 6, 7, 8, 9, 10, 11, 12, 13)};
    case 3:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
                                             3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
    case 4:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
                                             2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
    case 5:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
                                             1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
    case 6:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
    case 7:
      return Vec128<T, N>{wasm_i8x16_shuffle(
          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
    case 8:
      return Vec128<T, N>{wasm_i8x16_shuffle(
          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
    case 9:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
                                             6)};
    case 10:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
                                             5)};
    case 11:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
                                             4)};
    case 12:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 0, 1,
                                             2, 3)};
    case 13:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 0,
                                             1, 2)};
    case 14:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
                                             0, 1)};
    case 15:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
                                             16, 0)};
  }
  return Vec128<T, N>{zero};  // kBytes == 16
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
}
// ------------------------------ ShiftRightBytes

namespace detail {

// Helper function allows zeroing invalid lanes (needed for partial vectors).
template <int kBytes, typename T, size_t N>
HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  switch (kBytes) {
    case 0:
      return v.raw;
    case 1:
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);
    case 2:
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);
    case 3:
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);
    case 4:
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);
    case 5:
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);
    case 6:
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);
    case 7:
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);
    case 8:
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 9:
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 10:
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 11:
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 12:
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 13:
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 14:
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 15:
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
  }
  return zero;  // kBytes == 16
}

}  // namespace detail
// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
  // For partial vectors, clear upper lanes so we shift in zeros.
  if (N != 16 / sizeof(T)) {
    const Vec128<T> vfull{v.raw};
    v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
  }
  return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
}
// ------------------------------ ShiftRightLanes

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (shift whole register right)

// Full input: return upper half.
template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Half<Full128<T>> /* tag */,
                                           const Vec128<T> v) {
  return Vec128<T, 8 / sizeof(T)>{
      wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}

// Partial input: return copy of the upper half, zeroing invalid lanes.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
                                         Vec128<T, N> v) {
  const Simd<T, N> d;
  const auto upper = ShiftRightBytes<N * sizeof(T) / 2>(d, v);
  return Vec128<T, (N + 1) / 2>{upper.raw};
}
// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, class V = Vec128<T>>
HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  switch (kBytes) {
    case 0:
      return lo;
    case 1:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16)};
    case 2:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16, 17)};
    case 3:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                  12, 13, 14, 15, 16, 17, 18)};
    case 4:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                  13, 14, 15, 16, 17, 18, 19)};
    case 5:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20)};
    case 6:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20, 21)};
    case 7:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
                                  15, 16, 17, 18, 19, 20, 21, 22)};
    case 8:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23)};
    case 9:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24)};
    case 10:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};
    case 11:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};
    case 12:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};
    case 13:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};
    case 14:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};
    case 15:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
  }
  return hi;  // kBytes == 16
}
template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
          class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
  constexpr size_t kSize = N * sizeof(T);
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  const Full128<uint8_t> d_full8;
  using V8 = VFromD<decltype(d_full8)>;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move the valid bytes of lo into the most-significant positions.
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
  return V{BitCast(Full128<T>(), r).raw};
}
// ------------------------------ Broadcast/splat any lane

template <int kLane, size_t N>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>{wasm_i16x8_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane, size_t N>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

template <int kLane, size_t N>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>{wasm_i16x8_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane, size_t N>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

template <int kLane, size_t N>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}
// ------------------------------ TableLookupBytes

template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  // wasm_i8x16_swizzle is not yet available in all engines; emulate in scalar
  // code. Out-of-range control bytes yield zero.
  alignas(16) uint8_t control[16];
  alignas(16) uint8_t input[16];
  alignas(16) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  for (size_t i = 0; i < 16; ++i) {
    output[i] = control[i] < 16 ? input[control[i]] : 0;
  }
  return Vec128<TI, NI>{wasm_v128_load(output)};
}

// As above, but control bytes with the high bit set also yield 0 in that
// lane; the emulation above already has this behavior.
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
                                           const Vec128<TI, NI> from);
// Swap 32-bit halves within 64-bit halves.
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  __v128_u raw;
};

template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
#if HWY_IS_DEBUG_BUILD
  for (size_t i = 0; i < N; ++i) {
    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
  }
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  // Convert lane indices to byte indices for TableLookupBytes.
  alignas(16) uint8_t control[16] = {0};
  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
    for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
      control[idx_lane * sizeof(T) + idx_byte] =
          static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
    }
  }
  return Indices128<T, N>{Load(d8, control).raw};
}

template <size_t N>
HWY_API Vec128<uint32_t, N> TableLookupLanes(
    const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
}
template <size_t N>
HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
                                            const Indices128<int32_t, N> idx) {
  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
}
template <size_t N>
HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
                                          const Indices128<float, N> idx) {
  const Simd<int32_t, N> di;
  const Simd<float, N> df;
  return BitCast(
      df, TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
}
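// Usage sketch (not part of the header): reverse the lanes of an int32 vector
// via a runtime-specified permutation.
static inline Vec128<int32_t> PermuteExample(Vec128<int32_t> v) {
  const Full128<int32_t> d;
  alignas(16) const int32_t idx[4] = {3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, idx));
}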
// ------------------------------ Reverse

template <typename T>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
  return Shuffle0123(v);
}

template <typename T>
HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
  return Vec128<T, 2>(Shuffle2301(Vec128<T>(v.raw)).raw);
}

template <typename T>
HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
  return v;
}
// ------------------------------ InterleaveLower

HWY_API Vec128<uint8_t> InterleaveLower(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  return Vec128<uint8_t>{wasm_i8x16_shuffle(
      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec128<uint16_t> InterleaveLower(Vec128<uint16_t> a,
                                         Vec128<uint16_t> b) {
  return Vec128<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}

HWY_API Vec128<int8_t> InterleaveLower(Vec128<int8_t> a, Vec128<int8_t> b) {
  return Vec128<int8_t>{wasm_i8x16_shuffle(
      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec128<int16_t> InterleaveLower(Vec128<int16_t> a, Vec128<int16_t> b) {
  return Vec128<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, size_t N, class V = Vec128<T, N>>
HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper

HWY_API Vec128<uint8_t> InterleaveUpper(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  return Vec128<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
                                            26, 11, 27, 12, 28, 13, 29, 14,
                                            30, 15, 31)};
}
HWY_API Vec128<uint16_t> InterleaveUpper(Vec128<uint16_t> a,
                                         Vec128<uint16_t> b) {
  return Vec128<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}

HWY_API Vec128<int8_t> InterleaveUpper(Vec128<int8_t> a, Vec128<int8_t> b) {
  return Vec128<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
                                           26, 11, 27, 12, 28, 13, 29, 14,
                                           30, 15, 31)};
}
HWY_API Vec128<int16_t> InterleaveUpper(Vec128<int16_t> a, Vec128<int16_t> b) {
  return Vec128<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}

// Full
template <typename T, class V = Vec128<T>>
HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
  return InterleaveUpper(a, b);
}

// Partial: interleave the lower halves of the upper halves.
template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
}
// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}
// ------------------------------ Combine (Vec128)

// N = N/2 + N/2 (upper half undefined on input)
template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
                             Vec128<T, N / 2> lo_half) {
  const Half<decltype(d)> d2;
  const RebindToUnsigned<decltype(d2)> du2;
  // Treat as unsigned so InterleaveLower is defined for all lane types.
  using VU = VFromD<decltype(du2)>;
  const VU lo{BitCast(du2, lo_half).raw};
  const VU hi{BitCast(du2, hi_half).raw};
  return Vec128<T, N>{InterleaveLower(lo, hi).raw};
}

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N / 2> lo) {
  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}
// ------------------------------ ConcatLowerLower

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
                                   const Vec128<T> lo) {
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N> d, const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}

// ------------------------------ ConcatUpperUpper

template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
                                   const Vec128<T> lo) {
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N> d, const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}

// ------------------------------ ConcatLowerUpper

template <typename T>
HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
                                   const Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N> d, const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}

// ------------------------------ ConcatUpperLower

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}
// ------------------------------ ConcatOdd and ConcatEven

// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
}

// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2> d, Vec128<T, 2> hi,
                               Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}

// 64-bit full - no partial because we need at least two inputs to have
// even/odd lanes.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  return InterleaveUpper(d, lo, hi);
}

// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
}

// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2> d, Vec128<T, 2> hi,
                                Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}

// 64-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
  return InterleaveLower(d, lo, hi);
}
// ------------------------------ OddEven

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */,
                                const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N> d;
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */,
                                const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */,
                                const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */,
                                const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}
// ------------------------------ Promotions (part w/ narrow lanes -> full)

// Unsigned: zero-extend.
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
  return Vec128<uint32_t, N>{
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<uint8_t, N> v) {
  return Vec128<int32_t, N>{
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint16_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}

// Signed: sign-extend.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
  return Vec128<int32_t, N>{
      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int16_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
// float16 -> float32: decode sign/exponent/mantissa in integer math.
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
                                   const Vec128<float16_t, N> v) {
  const Simd<int32_t, N> di32;
  const Simd<uint32_t, N> du32;
  const Simd<float, N> df32;
  // Expand to u32 so we can shift.
  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}

// bfloat16 -> float32: simply shift the 16 bits into the upper half.
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
                                   const Vec128<bfloat16_t, N> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
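// Worked example (not part of the header): decoding the half-float 0x3C01.
// sign = 0, biased_exp = 0x0F (unbiased 0), mantissa = 0x001. Re-biasing for
// f32 gives 15 - 15 + 127 = 0x7F; the mantissa shifts left by 23 - 10 = 13.
// Result bits: 0x3F802000 = 1.0009765625f, exactly the value of the f16.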
// ------------------------------ Demotions (full -> part w/ narrow lanes)

template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t, N>{
      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}

template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
                                   const Vec128<int32_t, N> v) {
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<int8_t, N>{
      wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
}
// float32 -> float16: encode sign/exponent/mantissa, handling subnormals and
// flushing tiny values to zero.
template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
                                      const Vec128<float, N> v) {
  const Simd<int32_t, N> di;
  const Simd<uint32_t, N> du;
  const Simd<uint16_t, N> du16;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
}
// float32 -> bfloat16: truncate to the upper 16 bits.
template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
                                       const Vec128<float, N> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}

template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}

// For already range-limited input [0, 256).
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t, N>{
      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require a >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};

  // Replicate bytes 8x for any byte in the vector.
  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));

  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
}

}  // namespace detail

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
                                   const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}
// ------------------------------ Mask -> bits

namespace detail {

// Full vector of 8-bit lanes.
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /* tag */,
                                 const Mask128<T> mask) {
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, mask.raw);

  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
  return hi + lo;
}

// 64-bit part.
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /* tag */,
                                 const Mask128<T, 8> mask) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (wasm_i64x2_extract_lane(mask.raw, 0) * kMagic) >> 56;
}

// 32-bit or less: need masking of undefined bytes.
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /* tag */,
                                 const Mask128<T, N> mask) {
  uint64_t bytes = wasm_i64x2_extract_lane(mask.raw, 0);
  // Clear potentially undefined bytes.
  bytes &= (1ULL << (N * 8)) - 1;
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (bytes * kMagic) >> 56;
}
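// Why the multiply works (a sketch): each mask byte is 0x00 or 0xFF, and
// kMagic = 0x0001'0307'0F1F'3F80 sums shifted copies of the input so that,
// after >> 56, bit i of the result equals the MSB of byte i. For example,
// with only byte 0 true: 0xFF * kMagic = 0x0102040810204080, and shifting
// right by 56 leaves 0x01, i.e. bit 0 set.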
// 16-bit lanes: narrow to bytes (keeping the sign bit), then reuse the 8-bit
// path.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /* tag */,
                                 const Mask128<T, N> mask) {
  // Remove useless lower half of each u16 while preserving the sign bit.
  const __i16x8 zero = wasm_i16x8_splat(0);
  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
  return BitsFromMask(hwy::SizeTag<1>(), mask8);
}

template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /* tag */,
                                 const Mask128<T, N> mask) {
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(16) uint32_t lanes[4];
  wasm_v128_store(lanes, sliced_mask);
  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
}

// Returns the lowest N bits for the BitsFromMask result.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
}

// Dispatches on lane size and masks off inactive lanes.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}
// Returns all-ones for bytes with index >= N, else 0; used to blend away the
// undefined trailing bytes of partial vectors.
template <size_t N>
constexpr __i8x16 BytesAbove() {
  return /**/
      (N == 0)    ? wasm_i32x4_make(-1, -1, -1, -1)
      : (N == 4)  ? wasm_i32x4_make(0, -1, -1, -1)
      : (N == 8)  ? wasm_i32x4_make(0, 0, -1, -1)
      : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
      : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
      : (N == 2)  ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
      : (N == 6)  ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
      : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
      : (N == 1)  ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1, -1)
      : (N == 3)  ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1)
      : (N == 5)  ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1)
      : (N == 7)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1)
      : (N == 9)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
                                    -1, -1, -1)
      : (N == 11)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 13)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
          : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
}
// Returns the number of lanes whose mask is set.
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /* tag */, const Mask128<T> m) {
  return PopCount(BitsFromMask(hwy::SizeTag<1>(), m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /* tag */, const Mask128<T> m) {
  return PopCount(BitsFromMask(hwy::SizeTag<2>(), m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /* tag */, const Mask128<T> m) {
  // AND the per-lane values 1, 2, 4, 8 into set lanes, then count the bits.
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, shifted_bits);
  return PopCount(lanes[0] | lanes[1]);
}

}  // namespace detail
// `bits` points to at least 8 writable bytes. Returns the number written.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
                             const Mask128<T, N> mask, uint8_t* bits) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}
// ------------------------------ AllFalse

// Full vector
template <typename T>
HWY_API bool AllFalse(const Full128<T> /* tag */, const Mask128<T> m) {
  // Equivalent: !wasm_i8x16_any_true(v8.raw) after casting the mask to i8x16;
  // the extract-lane form below avoids that cast.
  return (wasm_i64x2_extract_lane(m.raw, 0) |
          wasm_i64x2_extract_lane(m.raw, 1)) == 0;
}

// Partial vector: clear the undefined trailing bytes so they do not affect
// the result.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> m) {
  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
  return AllFalse(Full128<T>(),
                  Mask128<T>{wasm_v128_andnot(m.raw, mask.raw)});
}
// ------------------------------ AllTrue / CountTrue

namespace detail {

template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /* tag */, const Mask128<T> m) {
  return wasm_i8x16_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /* tag */, const Mask128<T> m) {
  return wasm_i16x8_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /* tag */, const Mask128<T> m) {
  return wasm_i32x4_all_true(m.raw);
}

}  // namespace detail

// Full vector
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T> m) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
}

// Partial vector: set the undefined trailing bytes so they do not affect the
// result.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> m) {
  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
  return AllTrue(Full128<T>(), Mask128<T>{wasm_v128_or(m.raw, mask.raw)});
}

// Full vector
template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N> /* tag */,
                         const Mask128<T, N> mask) {
  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
}

// Partial vector: clear the undefined trailing bytes so they are not counted.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> m) {
  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
  return CountTrue(Full128<T>(),
                   Mask128<T>{wasm_v128_andnot(m.raw, mask.raw)});
}
// ------------------------------ Compress

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N> d;
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N> du;

  // We need byte indices for TableLookupBytes (one vector's worth for each of
  // 256 combinations of 8 mask bits). Loading them directly would require 4
  // KiB, so store lane indices (with the doubling to bytes already baked in)
  // and expand each to a byte pair below.
  alignas(16) constexpr uint8_t table[256 * 8] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
      0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
      0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
      0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
      6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
      0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
      0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
      2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
      0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
      0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
      0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
      0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
      6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
      8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
      0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
      4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
      10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
      0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
      0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
      0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
      4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
      0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
      0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
      2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
      10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
      0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
      0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
      0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
      0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
      0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
      0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
      6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
      12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
      0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
      0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
      0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
      8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
      0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
      0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
      2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
      8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
      12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
      0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
      0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
      10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
      12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
      0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
      4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
      6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
      0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
      0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
      0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
      4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
      12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
      0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
      2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
      0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
      0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
      0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
      0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
      14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
      0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
      0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
      8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
      14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
      0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
      0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
      0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
      6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
      14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
      0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
      2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
      14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
      0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
      0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
      0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
      6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
      10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
      0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
      4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
      8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
      0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
      0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
      0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
      4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
      0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
      0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
      2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
      14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
      0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
      0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
      0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
      12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
      14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
      0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
      6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
      8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
      14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
      0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
      0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
      10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
      14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
      0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
      2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
      10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
      12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
      0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
      0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
      8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
      10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
      0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
      4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
      6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};

  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector
  // directly.
  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
      0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
      4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,
      8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,
      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,
      12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,
      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};

  const Simd<T, N> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector
  // directly.
  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
      0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};

  const Simd<T, N> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}

#endif
// Helper functions called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /* tag */, Vec128<T, N> v,
                                 const uint64_t mask_bits) {
  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
  using D = Simd<T, N>;
  const RebindToSigned<D> di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
                                 const uint64_t mask_bits) {
  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
  using D = Simd<T, N>;
  const RebindToSigned<D> di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
                                 const uint64_t mask_bits) {
  const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
  using D = Simd<T, N>;
  const RebindToSigned<D> di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

#endif

}  // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
                          detail::BitsFromMask(mask));
}

// ------------------------------ CompressBits

template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
// ------------------------------ CompressStore

template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
                             Simd<T, N> d, T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
  StoreU(c, d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ CompressBitsStore

template <typename T, size_t N>
HWY_API size_t CompressBitsStore(Vec128<T, N> v,
                                 const uint8_t* HWY_RESTRICT bits,
                                 Simd<T, N> d, T* HWY_RESTRICT unaligned) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
  StoreU(c, d, unaligned);
  return PopCount(mask_bits);
}
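// Usage sketch (not part of the header): keep only lanes passing a predicate
// and write them contiguously; returns the number of lanes written.
static inline size_t FilterPositive(const int32_t* HWY_RESTRICT in,
                                    int32_t* HWY_RESTRICT out) {
  const Full128<int32_t> d;
  const auto v = LoadU(d, in);
  return CompressStore(v, v > Zero(d), d, out);
}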
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

// 128 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t> a,
                               const Vec128<uint8_t> b,
                               const Vec128<uint8_t> c, Full128<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  const auto k5 = Set(d, 5);
  const auto k6 = Set(d, 6);

  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = Load(d, tbl_r0);
  const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(a, shuf_r0);  // 5..4..3..2..1..0
  const auto g0 = TableLookupBytes(b, shuf_g0);  // ..4..3..2..1..0.
  const auto b0 = TableLookupBytes(c, shuf_b0);  // .4..3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  StoreU(int0, d, unaligned + 0 * 16);

  // Second vector: g10,r10, bgr[9:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
  const auto r1 = TableLookupBytes(a, shuf_r1);
  const auto g1 = TableLookupBytes(b, shuf_g1);
  const auto b1 = TableLookupBytes(c, shuf_b1);
  const auto int1 = r1 | g1 | b1;
  StoreU(int1, d, unaligned + 1 * 16);

  // Third vector: bgr[15:11], b10
  const auto shuf_r2 = shuf_b1 + k6;
  const auto shuf_g2 = shuf_r1 + k5;
  const auto shuf_b2 = shuf_g1 + k5;
  const auto r2 = TableLookupBytes(a, shuf_r2);
  const auto g2 = TableLookupBytes(b, shuf_g2);
  const auto b2 = TableLookupBytes(c, shuf_b2);
  const auto int2 = r2 | g2 | b2;
  StoreU(int2, d, unaligned + 2 * 16);
}
// 64 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
                               const Vec128<uint8_t, 8> b,
                               const Vec128<uint8_t, 8> c, Simd<uint8_t, 8> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and the first result.
  const Full128<uint8_t> d_full;
  const auto k5 = Set(d_full, 5);
  const auto k6 = Set(d_full, 6);

  const Vec128<uint8_t> full_a{a.raw};
  const Vec128<uint8_t> full_b{b.raw};
  const Vec128<uint8_t> full_c{c.raw};

  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = Load(d_full, tbl_r0);
  const auto shuf_g0 = Load(d_full, tbl_g0);  // cannot reuse r0 due to 5 in MSB
  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // 5..4..3..2..1..0
  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // ..4..3..2..1..0.
  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // .4..3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  StoreU(int0, d_full, unaligned + 0 * 16);

  // Second (half) vector: bgr[7:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;
  const auto shuf_g1 = shuf_r0 + k5;
  const auto shuf_b1 = shuf_g0 + k5;
  const auto r1 = TableLookupBytes(full_a, shuf_r1);
  const auto g1 = TableLookupBytes(full_b, shuf_g1);
  const auto b1 = TableLookupBytes(full_c, shuf_b1);
  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
  StoreU(int1, d, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b,
                               const Vec128<uint8_t, N> c,
                               Simd<uint8_t, N> /* tag */,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> d_full;

  const Vec128<uint8_t> full_a{a.raw};
  const Vec128<uint8_t> full_b{b.raw};
  const Vec128<uint8_t> full_c{c.raw};

  // Shuffle (a,b,c) vector bytes to bgr[3:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80};
  const auto shuf_r0 = Load(d_full, tbl_r0);  // shuffle r: 3..2..1..0
  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
  const auto r0 = TableLookupBytes(full_a, shuf_r0);
  const auto g0 = TableLookupBytes(full_b, shuf_g0);
  const auto b0 = TableLookupBytes(full_c, shuf_b0);
  const auto int0 = r0 | g0 | b0;
  alignas(16) uint8_t buf[16];
  StoreU(int0, d_full, buf);
  CopyBytes<N * 3>(buf, unaligned);
}
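// Usage sketch (not part of the header): write 16 RGB pixels (48 bytes) from
// separate channel planes.
static inline void PlanarToRGB(const uint8_t* HWY_RESTRICT r,
                               const uint8_t* HWY_RESTRICT g,
                               const uint8_t* HWY_RESTRICT b,
                               uint8_t* HWY_RESTRICT rgb) {
  const Full128<uint8_t> d;
  StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
}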
// ------------------------------ StoreInterleaved4

// 128 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
                               const Vec128<uint8_t> v1,
                               const Vec128<uint8_t> v2,
                               const Vec128<uint8_t> v3, Full128<uint8_t> d8,
                               uint8_t* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);      // b7 a7 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);      // d7 c7 .. d0 c0
  const auto ba8 = ZipUpper(d16, v0, v1);
  const auto dc8 = ZipUpper(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
  const auto dcba_8 = ZipLower(d32, ba8, dc8);  // d..aB d..a8
  const auto dcba_C = ZipUpper(d32, ba8, dc8);  // d..aF d..aC
  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}

// 64 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
                               const Vec128<uint8_t, 8> in1,
                               const Vec128<uint8_t, 8> in2,
                               const Vec128<uint8_t, 8> in3,
                               Simd<uint8_t, 8> /* tag */,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<uint8_t> d_full8;
  const RepartitionToWide<decltype(d_full8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const Vec128<uint8_t> v0{in0.raw};
  const Vec128<uint8_t> v1{in1.raw};
  const Vec128<uint8_t> v2{in2.raw};
  const Vec128<uint8_t> v3{in3.raw};
  const auto ba0 = ZipLower(d16, v0, v1);
  const auto dc0 = ZipLower(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);
  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
                               const Vec128<uint8_t, N> in1,
                               const Vec128<uint8_t, N> in2,
                               const Vec128<uint8_t, N> in3,
                               Simd<uint8_t, N> /* tag */,
                               uint8_t* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<uint8_t> d_full8;
  const RepartitionToWide<decltype(d_full8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const Vec128<uint8_t> v0{in0.raw};
  const Vec128<uint8_t> v1{in1.raw};
  const Vec128<uint8_t> v2{in2.raw};
  const Vec128<uint8_t> v3{in3.raw};
  const auto ba0 = ZipLower(d16, v0, v1);
  const auto dc0 = ZipLower(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);
  alignas(16) uint8_t buf[16];
  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
  CopyBytes<4 * N>(buf, unaligned);
}
// ------------------------------ MulEven/Odd (Mul128)

HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
                                    const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
  mul[0] =
      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
}

HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
                                   const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
  mul[0] =
      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
}
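// Usage sketch (not part of the header): the full 128-bit product of the even
// (lane 0) inputs. Lane 0 of the result holds the low 64 bits, lane 1 the
// high 64 bits.
static inline Vec128<uint64_t> Square128(Vec128<uint64_t> v) {
  return MulEven(v, v);  // {low64(v0 * v0), high64(v0 * v0)}
}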
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
                                                   Vec128<bfloat16_t, 2 * N> a,
                                                   Vec128<bfloat16_t, 2 * N> b,
                                                   const Vec128<float, N> sum0,
                                                   Vec128<float, N>& sum1) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToUnsigned<decltype(df32)> du32;
  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
  // Lane order within sum0/1 is undefined, so we can zip the bf16 bits into
  // the upper halves of f32 lanes instead of the slower lane-crossing
  // PromoteTo.
  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
// ------------------------------ Reductions

namespace detail {

// N=1: the single lane is already the result.
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}

// u32/i32/f32, N=2
template <typename T>
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
}
template <typename T>
HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}

// u32/i32/f32, full vector
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = v3210 + v1032;
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}

// u64/i64/f64, full vector
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Max(v10, v01);
}

}  // namespace detail
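// Usage sketch (not part of the header): horizontal sum of four floats. The
// reduction broadcasts the total into every lane; GetLane reads lane 0.
static inline float HorizontalSum(Vec128<float> v) {
  const Full128<float> d;
  return GetLane(SumOfLanes(d, v));
}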
// Supported for u/i/f 32/64. Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}

// ------------------------------ Deprecated wrappers

template <typename T, size_t N>
HWY_API bool AllTrue(const Mask128<T, N> mask) {
  return AllTrue(Simd<T, N>(), mask);
}

template <typename T, size_t N>
HWY_API bool AllFalse(const Mask128<T, N> mask) {
  return AllFalse(Simd<T, N>(), mask);
}
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
  return StoreMaskBits(Simd<T, N>(), mask, bits);
}

template <typename T, size_t N>
HWY_API size_t CountTrue(const Mask128<T, N> mask) {
  return CountTrue(Simd<T, N>(), mask);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
  return SumOfLanes(Simd<T, N>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
  return MinOfLanes(Simd<T, N>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
  return MaxOfLanes(Simd<T, N>(), v);
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
  return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
  return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
}

template <size_t kBytes, typename T, size_t N>
HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
  return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
  return InterleaveUpper(Simd<T, N>(), a, b);
}

template <typename T, size_t N, class D = Simd<T, N>>
HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
  return BitCast(RepartitionToWide<D>(), InterleaveUpper(D(), a, b));
}

template <typename T, size_t N2>
HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
  return Combine(Simd<T, N2 * 2>(), hi2, lo2);
}

template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
  return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatLowerLower(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatUpperUpper(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  return ConcatLowerUpper(Simd<T, N>(), hi, lo);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
  return ConcatUpperLower(Simd<T, N>(), hi, lo);
}
// ------------------------------ Comparison wrappers

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}
#define HWY_MAX(a, b)
Definition: base.h:123
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:66
#define HWY_IF_LE64(T, N)
Definition: base.h:271
#define HWY_API
Definition: base.h:117
#define HWY_MIN(a, b)
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:59
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:67
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: arm_neon-inl.h:506
detail::Raw128< T >::type raw
Definition: wasm_128-inl.h:102
Raw raw
Definition: arm_neon-inl.h:516
Definition: arm_neon-inl.h:468
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: wasm_128-inl.h:86
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: wasm_128-inl.h:89
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: wasm_128-inl.h:77
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: wasm_128-inl.h:92
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: wasm_128-inl.h:74
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:469
Raw raw
Definition: arm_neon-inl.h:501
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: wasm_128-inl.h:80
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: wasm_128-inl.h:83
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4233
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4288
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1652
HWY_INLINE Vec128< T, N > Idx16x8FromBits(const uint64_t mask_bits)
Definition: wasm_128-inl.h:2845
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:2703
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:2798
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1278
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1136
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2332
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:127
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4095
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4447
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:4769
HWY_INLINE Vec128< T, N > Idx32x4FromBits(const uint64_t mask_bits)
Definition: wasm_128-inl.h:2978
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:578
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4100
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:4431
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:551
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3490
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4091
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:624
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn, const Vec128<T, N> sign)  (Definition: arm_neon-inl.h:1573)
HWY_API Vec<D> SignBit(D d)  (Definition: generic_ops-inl.h:66)
svuint16_t Set(Simd<bfloat16_t, N> d, bfloat16_t arg)  (Definition: arm_sve-inl.h:299)
HWY_API Vec128<uint32_t, N> TableLookupLanes(const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx)  (Definition: arm_neon-inl.h:3342)
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N>, Vec128<T, N> v)  (Definition: arm_neon-inl.h:3064)
HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v)  (Definition: arm_neon-inl.h:1167)
HWY_API uint8_t GetLane(const Vec128<uint8_t, 16> v)  (Definition: arm_neon-inl.h:744)
HWY_API auto Lt(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5035)
HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a, const Vec128<uint64_t> b)  (Definition: arm_neon-inl.h:3435)
HWY_API Vec128<float, N> Round(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:2829)
HWY_API auto Eq(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5027)
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits)  (Definition: arm_neon-inl.h:1073)
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N/2> lo)  (Definition: arm_neon-inl.h:3629)
HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo)  (Definition: arm_neon-inl.h:2949)
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v)  (Definition: arm_neon-inl.h:3052)
HWY_API size_t StoreMaskBits(Simd<T, N>, const Mask128<T, N> mask, uint8_t* bits)  (Definition: arm_neon-inl.h:4528)
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N>, const Vec128<T, N> v)  (Definition: arm_neon-inl.h:4207)
HWY_API auto Gt(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5040)
HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num)  (Definition: arm_neon-inl.h:1806)
HWY_API Vec128<T, N> Load(Simd<T, N> d, const T* HWY_RESTRICT p)  (Definition: arm_neon-inl.h:2152)
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul, const Vec128<float, N> x, const Vec128<float, N> add)  (Definition: arm_neon-inl.h:1232)
HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* const HWY_RESTRICT p)  (Definition: arm_neon-inl.h:2164)
HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v)  (Definition: arm_neon-inl.h:1529)
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d, const T* HWY_RESTRICT base, const Vec128<Offset, N> offset)  (Definition: arm_neon-inl.h:4054)
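Load and MulAdd above (with Store, which appears later in this index) form the usual fused multiply-add pattern. A minimal sketch, assuming this header is reached via hwy/highway.h and we are inside HWY_NAMESPACE; the function and pointer names are illustrative only, and the pointers are assumed aligned with at least one full vector of floats:

// Illustrative: out[i] += a[i] * b[i] for one 128-bit vector of floats.
void MulAddOneVector(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                     float* HWY_RESTRICT out) {
  const Full128<float> d;              // tag describing 4 float lanes
  const auto va = Load(d, a);          // aligned load
  const auto vb = Load(d, b);
  const auto acc = Load(d, out);
  Store(MulAdd(va, vb, acc), d, out);  // per lane: acc + va * vb
}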
Repartition<MakeWide<TFromD<D>>, D> RepartitionToWide  (Definition: shared-inl.h:158)
HWY_API void Stream(const Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT aligned)  (Definition: arm_neon-inl.h:2352)
V Shl(V a, V b)  (Definition: arm_neon-inl.h:5018)
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v)  (Definition: arm_neon-inl.h:1607)
HWY_API auto Ge(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5044)
HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b)  (Definition: arm_neon-inl.h:1206)
HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N> d, Vec128<T, N> hi, Vec128<T, N> lo)  (Definition: arm_neon-inl.h:3681)
HWY_API Vec128<uint32_t> ConcatOdd(Full128<uint32_t>, Vec128<uint32_t> hi, Vec128<uint32_t> lo)  (Definition: arm_neon-inl.h:3760)
HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a, const Vec128<uint64_t, N> b)  (Definition: arm_neon-inl.h:1879)
HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a, const Vec128<uint64_t, N> b)  (Definition: arm_neon-inl.h:1917)
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx)  (Definition: arm_neon-inl.h:3323)
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v)  (Definition: arm_neon-inl.h:1600)
Rebind<MakeUnsigned<TFromD<D>>, D> RebindToUnsigned  (Definition: shared-inl.h:149)
HWY_API Vec128<uint8_t> operator<<(const Vec128<uint8_t> v, const Vec128<uint8_t> bits)  (Definition: arm_neon-inl.h:904)
constexpr HWY_API size_t Lanes(Simd<T, N>)  (Definition: arm_sve-inl.h:226)
HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b)  (Definition: arm_neon-inl.h:3947)
HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N> d, Vec128<T, N> hi, Vec128<T, N> lo)  (Definition: arm_neon-inl.h:3726)
HWY_API Vec128<T> Shuffle01(const Vec128<T> v)  (Definition: arm_neon-inl.h:3389)
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b)  (Definition: arm_neon-inl.h:1795)
HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a, const Vec1<uint8_t> b)  (Definition: scalar-inl.h:422)
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v)  (Definition: arm_neon-inl.h:3070)
HWY_API Vec128<uint32_t> ConcatEven(Full128<uint32_t>, Vec128<uint32_t> hi, Vec128<uint32_t> lo)  (Definition: arm_neon-inl.h:3802)
Vec128<T, N> Iota(const Simd<T, N> d, const T2 first)  (Definition: arm_neon-inl.h:734)
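As a usage sketch for the Iota, Min and Max entries above (same assumed HWY_NAMESPACE context; the index happens to list the uint64_t overloads of Min/Max, but the library also declares them for other lane types such as int32_t):

// Illustrative: clamp each lane of an index vector to [1, 2].
const Full128<int32_t> d;
const auto v = Iota(d, 0);                               // {0, 1, 2, 3}
const auto clamped = Min(Max(v, Set(d, 1)), Set(d, 2));  // {1, 1, 2, 2}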
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1438)
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1443)
HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b)  (Definition: arm_neon-inl.h:3907)
HWY_API Vec128<T, N/2> LowerHalf(const Vec128<T, N> v)  (Definition: arm_neon-inl.h:2903)
Rebind<MakeSigned<TFromD<D>>, D> RebindToSigned  (Definition: shared-inl.h:147)
HWY_API Vec128<uint16_t, 4> DemoteTo(Simd<uint16_t, 4>, const Vec128<int32_t> v)  (Definition: arm_neon-inl.h:2546)
HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b)  (Definition: scalar-inl.h:778)
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> dto, Mask128<TFrom, N> m)  (Definition: arm_neon-inl.h:1619)
HWY_API Vec128<T, N> Undefined(Simd<T, N>)  (Definition: arm_neon-inl.h:724)
HWY_API intptr_t FindFirstTrue(const Simd<T, N>, const Mask128<T, N> mask)  (Definition: arm_neon-inl.h:4520)
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1448)
HWY_API size_t CountTrue(Full128<T>, const Mask128<T> mask)  (Definition: arm_neon-inl.h:4509)
HWY_API Vec128<float> ConvertTo(Full128<float>, const Vec128<int32_t> v)  (Definition: arm_neon-inl.h:2739)
HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b)  (Definition: scalar-inl.h:392)
HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask, const Vec128<T, N> yes)  (Definition: arm_neon-inl.h:1642)
HWY_API V Add(V a, V b)  (Definition: arm_neon-inl.h:5000)
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v)  (Definition: arm_neon-inl.h:3395)
HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m)  (Definition: arm_neon-inl.h:4538)
HWY_API bool AllTrue(const Simd<T, N> d, const Mask128<T, N> m)  (Definition: arm_neon-inl.h:4557)
HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0, const Vec128<uint8_t> v1, const Vec128<uint8_t> v2, Full128<uint8_t>, uint8_t* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:4829)
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:2860)
HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b)  (Definition: arm_neon-inl.h:3538)
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul, const Vec128<float, N> x, const Vec128<float, N> sub)  (Definition: arm_neon-inl.h:1288)
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs, const Vec128<T, N> sign)  (Definition: arm_neon-inl.h:1581)
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v)  (Definition: arm_neon-inl.h:1655)
HWY_API void StoreU(const Vec128<uint8_t> v, Full128<uint8_t>, uint8_t* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:2170)
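The mask queries above (AllTrue, AllFalse, CountTrue, FindFirstTrue) compose with lane-wise comparisons. A sketch under the same assumed context:

// Illustrative: count and locate lanes equal to 3.
const Full128<int32_t> d;
const auto m = Iota(d, 0) == Set(d, 3);      // mask, true only in lane 3
const size_t num = CountTrue(d, m);          // 1
const intptr_t first = FindFirstTrue(d, m);  // 3, or -1 if no lane is true
const bool none = AllFalse(d, m);            // false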
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1735)
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:2846)
HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b)  (Definition: scalar-inl.h:794)
HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a, const Vec1<uint8_t> b)  (Definition: scalar-inl.h:475)
HWY_API Vec1<T> ShiftRight(const Vec1<T> v)  (Definition: scalar-inl.h:325)
HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v, Vec128<uint64_t, N> bit)  (Definition: arm_neon-inl.h:1827)
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N>, Vec128<T, N> v)  (Definition: arm_neon-inl.h:3041)
HWY_API Vec128<uint8_t> LoadU(Full128<uint8_t>, const uint8_t* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:1953)
HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a, const Vec128<int16_t> b)  (Definition: arm_neon-inl.h:1126)
HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a, const Vec1<uint8_t> b)  (Definition: scalar-inl.h:449)
HWY_API Vec128<uint8_t> Combine(Full128<uint8_t>, Vec128<uint8_t, 8> hi, Vec128<uint8_t, 8> lo)  (Definition: arm_neon-inl.h:3566)
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, const Vec128<float, N> b)  (Definition: arm_neon-inl.h:1194)
Simd<T, 16 / sizeof(T)> Full128  (Definition: arm_neon-inl.h:30)
HWY_API Vec128<uint16_t> PromoteTo(Full128<uint16_t>, const Vec128<uint8_t, 8> v)  (Definition: arm_neon-inl.h:2362)
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1384)
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base, const Vec128<Index, N> index)  (Definition: arm_neon-inl.h:4071)
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d, const T* HWY_RESTRICT aligned)  (Definition: arm_neon-inl.h:2157)
HWY_API Vec128<T, N> BitCast(Simd<T, N> d, Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v)  (Definition: arm_neon-inl.h:687)
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, const Vec128<Index, N> index)  (Definition: arm_neon-inl.h:4036)
HWY_API V Sub(V a, V b)  (Definition: arm_neon-inl.h:5004)
HWY_API Vec128<T> Reverse(Full128<T>, const Vec128<T> v)  (Definition: arm_neon-inl.h:3362)
HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N> d, Vec128<T, N> hi, Vec128<T, N> lo)  (Definition: arm_neon-inl.h:3637)
typename D::template Rebind<T> Rebind  (Definition: shared-inl.h:144)
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b)  (Definition: arm_neon-inl.h:1799)
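BitCast above reinterprets lane bits under a different tag, and the Rebind* aliases produce a matching tag with the same lane count. A sketch under the same assumptions:

// Illustrative: view float lanes as their IEEE-754 bit patterns.
const Full128<float> df;
const RebindToUnsigned<decltype(df)> du;       // Simd<uint32_t, 4>
const auto bits = BitCast(du, Set(df, 1.0f));  // each lane: 0x3F800000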
HWY_API V InterleaveUpper(Simd<T, N>, V a, V b)  (Definition: arm_neon-inl.h:3511)
HWY_API Vec128<uint16_t> operator*(const Vec128<uint16_t> a, const Vec128<uint16_t> b)  (Definition: arm_neon-inl.h:1084)
HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask, const Vec128<T, N> no)  (Definition: arm_neon-inl.h:1649)
HWY_API Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v)  (Definition: arm_neon-inl.h:1698)
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1430)
HWY_API Vec1<T> ShiftLeft(const Vec1<T> v)  (Definition: scalar-inl.h:319)
HWY_API auto Le(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5049)
HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits)  (Definition: arm_neon-inl.h:1077)
decltype(detail::DeduceD()(V())) DFromV  (Definition: arm_neon-inl.h:532)
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:2890)
HWY_API Vec128<T> Not(const Vec128<T> v)  (Definition: arm_neon-inl.h:1366)
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d, const uint8_t* HWY_RESTRICT bits)  (Definition: arm_neon-inl.h:4276)
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul, const Vec128<float, N> x, const Vec128<float, N> add)  (Definition: arm_neon-inl.h:1266)
V Shr(V a, V b)  (Definition: arm_neon-inl.h:5022)
decltype(Zero(D())) VFromD  (Definition: arm_neon-inl.h:720)
HWY_API size_t CompressBitsStore(Vec128<T, N> v, const uint8_t* HWY_RESTRICT bits, Simd<T, N> d, T* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:4812)
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:3844)
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N>, const Vec128<T, N> v)  (Definition: arm_neon-inl.h:4211)
HWY_API Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v)  (Definition: arm_neon-inl.h:866)
HWY_API Vec128<uint8_t, 4> U8FromU32(const Vec128<uint32_t> v)  (Definition: arm_neon-inl.h:2699)
HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v)  (Definition: arm_neon-inl.h:3235)
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, Vec128<T, N> hi, Vec128<T, N> lo)  (Definition: arm_neon-inl.h:3752)
HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v, const uint8_t* HWY_RESTRICT bits)  (Definition: arm_neon-inl.h:4787)
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:2818)
HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b)  (Definition: arm_neon-inl.h:3527)
typename D::Half Half  (Definition: shared-inl.h:164)
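ZipLower above (and ZipUpper, listed earlier) interleaves the lower (respectively upper) halves of two vectors and returns the result viewed with widened lanes, matching the RepartitionToWide alias. Sketch:

// Illustrative: each u16 lane pairs a byte of a (low half) with the
// corresponding byte of b (high half).
const Full128<uint8_t> d8;
const RepartitionToWide<decltype(d8)> d16;  // Simd<uint16_t, 8>
const auto a = Iota(d8, 0);
const auto b = Iota(d8, 0x80);
const VFromD<decltype(d16)> lo = ZipLower(a, b);       // from lower halves
const VFromD<decltype(d16)> hi = ZipUpper(d16, a, b);  // from upper halves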
typename D::template Repartition<T> Repartition  (Definition: shared-inl.h:155)
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N>, const Vec128<T, N> v)  (Definition: arm_neon-inl.h:4203)
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base, const Vec128<Offset, N> offset)  (Definition: arm_neon-inl.h:4019)
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b)  (Definition: arm_neon-inl.h:2665)
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask, Simd<T, N> d, T* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:4802)
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v)  (Definition: arm_neon-inl.h:3401)
HWY_API auto Ne(V a, V b) -> decltype(a == b)  (Definition: arm_neon-inl.h:5031)
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v)  (Definition: arm_neon-inl.h:1348)
HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes, const Vec128<TI> from)  (Definition: arm_neon-inl.h:3957)
HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0, const Vec128<uint8_t> v1, const Vec128<uint8_t> v2, const Vec128<uint8_t> v3, Full128<uint8_t>, uint8_t* HWY_RESTRICT unaligned)  (Definition: arm_neon-inl.h:4864)
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask, const Vec128<T, N> mask)  (Definition: arm_neon-inl.h:1398)
HWY_API V Div(V a, V b)  (Definition: arm_neon-inl.h:5013)
HWY_API Vec128<uint8_t, 8> UpperHalf(Simd<uint8_t, 8>, const Vec128<uint8_t> v)  (Definition: arm_neon-inl.h:3096)
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v)  (Definition: arm_neon-inl.h:3407)
HWY_API Vec128<uint8_t> operator>>(const Vec128<uint8_t> v, const Vec128<uint8_t> bits)  (Definition: arm_neon-inl.h:984)
HWY_API V Mul(V a, V b)  (Definition: arm_neon-inl.h:5009)
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v)  (Definition: arm_neon-inl.h:1590)
TFromD<DFromV<V>> TFromV  (Definition: arm_neon-inl.h:535)
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v)  (Definition: arm_neon-inl.h:3385)
HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v)  (Definition: arm_neon-inl.h:1320)
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask)  (Definition: arm_neon-inl.h:4780)
HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b)  (Definition: scalar-inl.h:405)
HWY_API Vec128<T, N> Zero(Simd<T, N> d)  (Definition: arm_neon-inl.h:710)
HWY_API void Store(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT aligned)  (Definition: arm_neon-inl.h:2343)
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32, Vec128<bfloat16_t, 2 * N> a, Vec128<bfloat16_t, 2 * N> b, const Vec128<float, N> sum0, Vec128<float, N>& sum1)  (Definition: arm_neon-inl.h:3545)
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)  (Definition: arm_neon-inl.h:4012)
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b)  (Definition: arm_neon-inl.h:1419)
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul, const Vec128<float, N> x, const Vec128<float, N> sub)  (Definition: arm_neon-inl.h:1296)
HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes, const Vec1<T> no)  (Definition: scalar-inl.h:263)
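Compress above moves the lanes selected by a mask to the front of the vector; CompressStore additionally writes them out and returns how many lanes were stored. A sketch under the same assumptions:

// Illustrative: keep only lanes greater than 1.
const Full128<float> d;
const auto v = Iota(d, 0.0f);        // {0, 1, 2, 3}
const auto keep = v > Set(d, 1.0f);  // true in lanes 2 and 3
HWY_ALIGN float out[4] = {0.0f, 0.0f, 0.0f, 0.0f};
const size_t n = CompressStore(v, keep, d, out);  // out[0..1] = {2, 3}; n = 2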
HWY_API void CopyBytes(const From* from, To* to)  (Definition: base.h:634)
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)  (Definition: base.h:565)
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper)  (Definition: base.h:613)
typename detail::TypeFromSize<N>::Unsigned UnsignedFromSize  (Definition: base.h:535)
HWY_API size_t PopCount(uint64_t x)  (Definition: base.h:589)
typename detail::Relations<T>::Wide MakeWide  (Definition: base.h:529)
typename detail::Relations<T>::Signed MakeSigned  (Definition: base.h:523)
#define HWY_ALIGN  (Definition: set_macros-inl.h:78)
#define HWY_NAMESPACE  (Definition: set_macros-inl.h:77)
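The base.h helpers above are plain scalar utilities, usable outside SIMD code. A short sketch (results per the documented semantics):

// Illustrative uses of the scalar helpers.
const size_t bits = hwy::PopCount(0xF0ull);                     // 4
const size_t tz = hwy::Num0BitsBelowLS1Bit_Nonzero64(0x10ull);  // 4
uint64_t upper;
const uint64_t lower = hwy::Mul128(1ull << 32, 1ull << 32, &upper);
// full product is 2^64: lower == 0, upper == 1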
__v128_u raw  (Definition: wasm_128-inl.h:1997)
HWY_INLINE __f32x4 operator()(__v128_u v)  (Definition: wasm_128-inl.h:147)
HWY_INLINE __v128_u operator()(__v128_u v)  (Definition: wasm_128-inl.h:143)
Simd<T, N> operator()(Vec128<T, N>) const  (Definition: wasm_128-inl.h:110)
__f32x4 type  (Definition: wasm_128-inl.h:62)
__v128_u type  (Definition: wasm_128-inl.h:58)