41 return *
this = (*
this * other);
44 return *
this = (*
this / other);
47 return *
this = (*
this + other);
50 return *
this = (*
this - other);
53 return *
this = (*
this & other);
56 return *
this = (*
this | other);
59 return *
this = (*
this ^ other);
93 using DFromV = decltype(detail::Deduce1()(V()));
96 using TFromV = TFromD<DFromV<V>>;
100 template <
typename T,
typename FromT>
102 static_assert(
sizeof(T) <=
sizeof(FromT),
"Promoting is undefined");
104 CopyBytes<sizeof(FromT)>(&v.
raw, &to);
110 template <
typename T>
115 template <
typename T,
typename T2>
117 return Vec1<T>(
static_cast<T
>(t));
120 template <
typename T>
125 template <
typename T,
typename T2>
127 return Vec1<T>(
static_cast<T
>(first));
134 template <
typename T>
143 template <
typename T>
149 template <
typename T>
156 template <
typename T>
166 template <
typename T>
172 template <
typename T>
179 template <
typename T>
185 template <
typename T>
192 template <
typename T>
194 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
199 template <
typename T>
201 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
207 template <
typename T>
215 #ifdef HWY_NATIVE_POPCNT
216 #undef HWY_NATIVE_POPCNT
218 #define HWY_NATIVE_POPCNT
221 template <
typename T>
228 template <
typename TFrom,
typename TTo>
230 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
235 template <
typename T>
242 template <
typename T>
249 template <
typename T>
256 template <
typename T>
262 template <
typename T>
265 return mask.
bits ? yes : no;
268 template <
typename T>
273 template <
typename T>
278 template <
typename T>
285 template <
typename T>
290 template <
typename T>
296 template <
typename T>
302 template <
typename T>
308 template <
typename T>
318 template <
int kBits,
typename T>
320 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
324 template <
int kBits,
typename T>
326 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
327 #if __cplusplus >= 202002L
337 const TU shifted =
BitCast(du, v).raw >> kBits;
339 const TU upper = sign << (
sizeof(TU) * 8 - 1 - kBits);
349 template <
typename T>
354 template <
typename T>
356 #if __cplusplus >= 202002L
366 const TU shifted =
BitCast(du, v).raw >> bits;
368 const TU upper = sign << (
sizeof(TU) * 8 - 1 - bits);
379 template <
typename T>
384 template <
typename T>
391 template <
typename T>
393 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw);
394 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw);
395 return Vec1<T>(
static_cast<T
>((a64 + b64) &
static_cast<uint64_t
>(~T(0))));
404 template <
typename T>
406 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw);
407 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw);
408 return Vec1<T>(
static_cast<T
>((a64 - b64) &
static_cast<uint64_t
>(~T(0))));
486 template <
typename T>
489 return (i >= 0 || i == hwy::LimitsMin<T>()) ? a :
Vec1<T>(-i);
500 template <
typename T, HWY_IF_NOT_FLOAT(T)>
505 template <
typename T, HWY_IF_FLOAT(T)>
506 HWY_API Vec1<T>
Min(
const Vec1<T> a,
const Vec1<T> b) {
507 if (std::isnan(a.raw))
return b;
508 if (std::isnan(b.raw))
return a;
509 return Vec1<T>(
HWY_MIN(a.raw, b.raw));
512 template <
typename T, HWY_IF_NOT_FLOAT(T)>
517 template <
typename T, HWY_IF_FLOAT(T)>
518 HWY_API Vec1<T>
Max(
const Vec1<T> a,
const Vec1<T> b) {
519 if (std::isnan(a.raw))
return b;
520 if (std::isnan(b.raw))
return a;
521 return Vec1<T>(
HWY_MAX(a.raw, b.raw));
526 template <
typename T, HWY_IF_FLOAT(T)>
531 template <
typename T, HWY_IF_NOT_FLOAT(T)>
533 return Zero(Sisd<T>()) - v;
538 template <
typename T, HWY_IF_FLOAT(T)>
543 template <
typename T, HWY_IF_SIGNED(T)>
545 return Vec1<T>(
static_cast<T
>(int64_t(a.raw) * b.raw));
548 template <
typename T, HWY_IF_UNSIGNED(T)>
550 return Vec1<T>(
static_cast<T
>(uint64_t(a.raw) * b.raw));
553 template <
typename T>
567 (
static_cast<uint32_t
>(a.
raw) *
static_cast<uint32_t
>(b.
raw)) >> 16));
572 const int64_t a64 = a.
raw;
576 const uint64_t a64 = a.
raw;
596 template <
typename T>
598 return mul * x + add;
601 template <
typename T>
604 return add - mul * x;
607 template <
typename T>
609 return mul * x - sub;
612 template <
typename T>
615 return Neg(mul) * x - sub;
623 const float half = f * 0.5f;
625 CopyBytes<4>(&f, &bits);
627 bits = 0x5F3759DF - (bits >> 1);
628 CopyBytes<4>(&bits, &f);
643 template <
typename T>
646 if (!(
Abs(v).raw < MantissaEnd<T>())) {
649 const T bias = v.
raw < T(0.0) ? T(-0.5) : T(0.5);
650 const TI rounded =
static_cast<TI
>(v.
raw + bias);
653 if ((rounded & 1) && std::abs(rounded - v.
raw) == T(0.5)) {
654 return Vec1<T>(
static_cast<T
>(rounded - (v.
raw < T(0) ? -1 : 1)));
656 return Vec1<T>(
static_cast<T
>(rounded));
664 const T abs =
Abs(v).raw;
665 const bool signbit = std::signbit(v.
raw);
667 if (!(abs < MantissaEnd<T>())) {
669 if (!(abs <=
static_cast<T
>(LimitsMax<TI>()))) {
670 return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
674 const T bias = v.
raw < T(0.0) ? T(-0.5) : T(0.5);
675 const TI rounded =
static_cast<TI
>(v.
raw + bias);
678 if ((rounded & 1) && std::abs(
static_cast<T
>(rounded) - v.
raw) == T(0.5)) {
679 return Vec1<TI>(rounded - (signbit ? -1 : 1));
684 template <
typename T>
687 if (!(
Abs(v).raw <= MantissaEnd<T>())) {
690 const TI truncated =
static_cast<TI
>(v.
raw);
692 return Vec1<T>(
static_cast<T
>(truncated));
695 template <
typename Float,
typename Bits,
int kMantissaBits,
int kExponentBits,
698 const Bits kExponentMask = (1ull << kExponentBits) - 1;
699 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
700 const Bits kBias = kExponentMask / 2;
703 const bool positive = f > Float(0.0);
706 CopyBytes<sizeof(Bits)>(&v, &bits);
709 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
711 if (exponent >= kMantissaBits)
return v;
713 if (exponent < 0)
return positive ? V(1) : V(-0.0);
715 const Bits mantissa_mask = kMantissaMask >> exponent;
717 if ((bits & mantissa_mask) == 0)
return v;
720 if (positive) bits += (kMantissaMask + 1) >> exponent;
721 bits &= ~mantissa_mask;
723 CopyBytes<sizeof(Bits)>(&bits, &f);
727 template <
typename Float,
typename Bits,
int kMantissaBits,
int kExponentBits,
730 const Bits kExponentMask = (1ull << kExponentBits) - 1;
731 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
732 const Bits kBias = kExponentMask / 2;
735 const bool negative = f < Float(0.0);
738 CopyBytes<sizeof(Bits)>(&v, &bits);
741 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
743 if (exponent >= kMantissaBits)
return v;
745 if (exponent < 0)
return V(negative ? Float(-1.0) : Float(0.0));
747 const Bits mantissa_mask = kMantissaMask >> exponent;
749 if ((bits & mantissa_mask) == 0)
return v;
752 if (negative) bits += (kMantissaMask + 1) >> exponent;
753 bits &= ~mantissa_mask;
755 CopyBytes<sizeof(Bits)>(&bits, &f);
761 return Ceiling<float, uint32_t, 23, 8>(v);
764 return Ceiling<double, uint64_t, 52, 11>(v);
769 return Floor<float, uint32_t, 23, 8>(v);
772 return Floor<double, uint64_t, 52, 11>(v);
777 template <
typename T>
782 template <
typename T>
787 template <
typename T>
789 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
790 return (v & bit) == bit;
793 template <
typename T>
797 template <
typename T>
802 template <
typename T>
806 template <
typename T>
815 template <
typename T>
818 CopyBytes<sizeof(T)>(aligned, &t);
822 template <
typename T>
828 template <
typename T>
834 template <
typename T>
836 return Load(d, aligned);
841 template <
typename T>
844 CopyBytes<sizeof(T)>(&v.
raw, aligned);
847 template <
typename T>
849 return Store(v, d, p);
857 StoreU(v0, d, unaligned + 0);
858 StoreU(v1, d, unaligned + 1);
859 StoreU(v2, d, unaligned + 2);
866 StoreU(v0, d, unaligned + 0);
867 StoreU(v1, d, unaligned + 1);
868 StoreU(v2, d, unaligned + 2);
869 StoreU(v3, d, unaligned + 3);
874 template <
typename T>
876 return Store(v, d, aligned);
881 template <
typename T,
typename Offset>
884 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
885 uint8_t*
const base8 =
reinterpret_cast<uint8_t*
>(base) + offset.
raw;
886 return Store(v, d,
reinterpret_cast<T*
>(base8));
889 template <
typename T,
typename Index>
892 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
893 return Store(v, d, base + index.
raw);
898 template <
typename T,
typename Offset>
901 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
902 const uintptr_t addr =
reinterpret_cast<uintptr_t
>(base) + offset.
raw;
903 return Load(d,
reinterpret_cast<const T*
>(addr));
906 template <
typename T,
typename Index>
909 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
910 return Load(d, base + index.
raw);
918 template <
typename FromT,
typename ToT>
920 static_assert(
sizeof(ToT) >
sizeof(FromT),
"Not promoting");
925 template <
typename FromT,
typename ToT, HWY_IF_FLOAT(FromT)>
927 static_assert(
sizeof(ToT) <
sizeof(FromT),
"Not demoting");
930 if (std::isinf(from.
raw) ||
931 std::fabs(from.
raw) >
static_cast<FromT
>(HighestValue<ToT>())) {
932 return Vec1<ToT>(std::signbit(from.
raw) ? LowestValue<ToT>()
933 : HighestValue<ToT>());
938 template <
typename FromT,
typename ToT, HWY_IF_NOT_FLOAT(FromT)>
940 static_assert(
sizeof(ToT) <
sizeof(FromT),
"Not demoting");
943 from.raw =
HWY_MIN(
HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
944 return Vec1<ToT>(
static_cast<ToT
>(from.raw));
948 #if HWY_NATIVE_FLOAT16
950 CopyBytes<2>(&v.
raw, &bits16);
952 const uint16_t bits16 = v.
raw.bits;
954 const uint32_t sign =
static_cast<uint32_t
>(bits16 >> 15);
955 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
956 const uint32_t mantissa = bits16 & 0x3FF;
959 if (biased_exp == 0) {
960 const float subnormal =
961 (1.0f / 16384) * (
static_cast<float>(mantissa) * (1.0f / 1024));
966 const uint32_t biased_exp32 = biased_exp + (127 - 15);
967 const uint32_t mantissa32 = mantissa << (23 - 10);
968 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
970 CopyBytes<4>(&bits32, &out);
981 CopyBytes<4>(&v.
raw, &bits32);
982 const uint32_t sign = bits32 >> 31;
983 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
984 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
986 const int32_t exp =
HWY_MIN(
static_cast<int32_t
>(biased_exp32) - 127, 15);
991 #if HWY_NATIVE_FLOAT16
992 const uint16_t zero = 0;
993 CopyBytes<2>(&zero, &out.
raw);
1000 uint32_t biased_exp16, mantissa16;
1005 const uint32_t sub_exp =
static_cast<uint32_t
>(-14 - exp);
1007 mantissa16 =
static_cast<uint32_t
>((1u << (10 - sub_exp)) +
1008 (mantissa32 >> (13 + sub_exp)));
1011 biased_exp16 =
static_cast<uint32_t
>(exp + 15);
1012 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1013 mantissa16 = mantissa32 >> 13;
1017 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1019 #if HWY_NATIVE_FLOAT16
1020 const uint16_t narrowed =
static_cast<uint16_t
>(bits16);
1021 CopyBytes<2>(&narrowed, &out.
raw);
1023 out.
raw.bits =
static_cast<uint16_t
>(bits16);
1032 template <
typename FromT,
typename ToT, HWY_IF_FLOAT(FromT)>
1034 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1037 const double f =
static_cast<double>(from.
raw);
1038 if (std::isinf(from.
raw) ||
1039 std::fabs(f) >
static_cast<double>(LimitsMax<ToT>())) {
1040 return Vec1<ToT>(std::signbit(from.
raw) ? LimitsMin<ToT>()
1041 : LimitsMax<ToT>());
1046 template <
typename FromT,
typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1048 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1050 return Vec1<ToT>(
static_cast<ToT
>(from.raw));
1060 template <
typename T>
1065 template <
typename T>
1073 template <
typename T>
1081 template <
typename T>
1086 template <
typename T>
1092 template <
typename T>
1099 template <
typename T>
1109 template <
int kLane,
typename T>
1111 static_assert(kLane == 0,
"Scalar only has one lane");
1119 template <
typename T>
1121 uint8_t in_bytes[
sizeof(T)];
1122 uint8_t from_bytes[
sizeof(T)];
1123 uint8_t out_bytes[
sizeof(T)];
1124 CopyBytes<sizeof(T)>(&in, &in_bytes);
1125 CopyBytes<sizeof(T)>(&from, &from_bytes);
1126 for (
size_t i = 0; i <
sizeof(T); ++i) {
1127 out_bytes[i] = in_bytes[from_bytes[i]];
1130 CopyBytes<sizeof(T)>(&out_bytes, &out);
1134 template <
typename T>
1136 uint8_t in_bytes[
sizeof(T)];
1137 uint8_t from_bytes[
sizeof(T)];
1138 uint8_t out_bytes[
sizeof(T)];
1139 CopyBytes<sizeof(T)>(&in, &in_bytes);
1140 CopyBytes<sizeof(T)>(&from, &from_bytes);
1141 for (
size_t i = 0; i <
sizeof(T); ++i) {
1142 out_bytes[i] = from_bytes[i] & 0x80 ? 0 : in_bytes[from_bytes[i]];
1145 CopyBytes<sizeof(T)>(&out_bytes, &out);
1172 template <
typename T,
typename TW = MakeW
ide<T>,
class VW = Vec1<TW>>
1174 return VW(
static_cast<TW
>((TW{b.
raw} << (
sizeof(T) * 8)) + a.
raw));
1179 template <
typename T>
1181 return mask.
bits == 0;
1184 template <
typename T>
1186 return mask.
bits != 0;
1190 template <
typename T>
1197 template <
typename T>
1203 template <
typename T>
1205 return mask.
bits == 0 ? 0 : 1;
1208 template <
typename T>
1210 return mask.
bits == 0 ? -1 : 0;
1215 template <
typename T>
1221 template <
typename T>
1228 template <
typename T>
1237 template <
typename T>
1259 template <
typename T>
1263 template <
typename T>
1267 template <
typename T>
1274 template <
typename T>
1279 template <
typename T>
1284 template <
typename T>
1289 template <
typename T>
1294 template <
typename T>
1298 template <
typename T>
1302 template <
typename T>
1337 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1341 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1345 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1350 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1354 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1359 HWY_API auto Le(V a, V b) -> decltype(a == b) {
#define HWY_MAX(a, b)
Definition: base.h:123
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_API
Definition: base.h:117
#define HWY_MIN(a, b)
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:59
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: scalar-inl.h:67
Raw bits
Definition: scalar-inl.h:77
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:68
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:71
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5035
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5027
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1073
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5040
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
V Shl(V a, V b)
Definition: arm_neon-inl.h:5018
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5044
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:904
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1795
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1438
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1443
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:778
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1448
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:392
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1735
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:794
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1194
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5004
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1799
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1084
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
V Ceiling(const V v)
Definition: scalar-inl.h:697
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5049
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1077
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
V Shr(V a, V b)
Definition: arm_neon-inl.h:5022
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5013
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:984
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5009
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:535
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:405
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4012
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
Definition: aligned_allocator.h:23
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:634
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:648
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:656
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:589
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:523
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
Definition: scalar-inl.h:1082
int raw
Definition: scalar-inl.h:1083
Definition: shared-inl.h:35
Definition: scalar-inl.h:34
T raw
Definition: scalar-inl.h:62
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:40
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:49
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:46
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:43
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:52
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:38
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:58
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:55
Definition: scalar-inl.h:83
Sisd< T > operator()(Vec1< T >) const
Definition: scalar-inl.h:85