21 #if defined(HWY_EMULATE_SVE)
22 #include "third_party/farm_sve/farm_sve.h"
35 template <typename T, int kShift = 0>
46 #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
47 #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
48 #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
49 #define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
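// Added usage sketch (not part of the original header): the *_V helpers
// constrain overloads by the vector's lane type. For example,
//   template <class V, HWY_IF_SIGNED_V(V)>
//   V ExampleNegate(V v) { return Sub(Zero(DFromV<V>()), v); }
// would only participate in overload resolution for signed-integer vectors.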
59 #define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, NAME, OP)
60 #define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, NAME, OP)
61 #define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) X_MACRO(uint, u, 32, NAME, OP)
62 #define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) X_MACRO(uint, u, 64, NAME, OP)
65 #define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, NAME, OP)
66 #define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, NAME, OP)
67 #define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, NAME, OP)
68 #define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, NAME, OP)
71 #define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) X_MACRO(float, f, 16, NAME, OP)
72 #define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) X_MACRO(float, f, 32, NAME, OP)
73 #define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) X_MACRO(float, f, 64, NAME, OP)
76 #define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
77 HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
78 HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
79 HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
80 HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
82 #define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
83 HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \
84 HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \
85 HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
86 HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
88 #define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
89 HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
90 HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
91 HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
94 #define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
95 HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
96 HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
98 #define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
99 HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
100 HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)
102 #define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
103 HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
104 HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)
106 #define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
107 HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
108 HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
110 #define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
111 HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
112 HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
113 HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
114 HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
117 #define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
118 HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
119 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
121 #define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
122 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
123 HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
125 #define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
126 HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
127 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
128 HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
131 #define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
132 #define HWY_SVE_D(BASE, BITS, N) Simd<HWY_SVE_T(BASE, BITS), N>
133 #define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
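// Added illustration (sketch): for BASE=uint, BITS=8 these expand to
//   HWY_SVE_T(uint, 8) -> uint8_t
//   HWY_SVE_V(uint, 8) -> svuint8_t
//   HWY_SVE_D(uint, 8, N) -> Simd<uint8_t, N>
// so the X macros above can name the scalar, vector and descriptor types for
// each lane type without repeating the type tables.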
137 #define HWY_SPECIALIZE(BASE, CHAR, BITS, NAME, OP) \
139 struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
140 using type = HWY_SVE_D(BASE, BITS, HWY_LANES(HWY_SVE_T(BASE, BITS))); \
144 #undef HWY_SPECIALIZE
147 #define HWY_SVE_RETV_ARGD(BASE, CHAR, BITS, NAME, OP) \
148 template <size_t N> \
149 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N) d) { \
150 return sv##OP##_##CHAR##BITS(); \
157 #define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, NAME, OP) \
158 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
159 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
161 #define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, NAME, OP) \
162 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
163 return sv##OP##_##CHAR##BITS(v); \
167 #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, NAME, OP) \
168 HWY_API HWY_SVE_V(BASE, BITS) \
169 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
170 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
172 #define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, NAME, OP) \
173 HWY_API HWY_SVE_V(BASE, BITS) \
174 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
175 return sv##OP##_##CHAR##BITS(a, b); \
179 #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, NAME, OP) \
180 HWY_API HWY_SVE_V(BASE, BITS) \
181 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
182 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
184 #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, NAME, OP) \
185 HWY_API HWY_SVE_V(BASE, BITS) \
186 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
187 return sv##OP##_##CHAR##BITS(a, b); \
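// Added illustration (sketch): ops are generated by pairing a FOREACH list
// with one of these signature macros. Assuming the Add/add pairing used by
// Highway, HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add) expands, for the
// u32 entry, to roughly
//   HWY_API svuint32_t Add(svuint32_t a, svuint32_t b) {
//     return svadd_u32_x(HWY_SVE_PTRUE(32), a, b);
//   }
// i.e. one overload per lane type, each forwarding to the ACLE intrinsic.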
196 return svcntb_pat(SV_ALL);
199 return svcnth_pat(SV_ALL);
202 return svcntw_pat(SV_ALL);
205 return svcntd_pat(SV_ALL);
210 return svcntb_pat(SV_POW2);
213 return svcnth_pat(SV_POW2);
216 return svcntw_pat(SV_POW2);
219 return svcntd_pat(SV_POW2);
225 template <typename T, size_t N, HWY_IF_LE128(T, N)>
233 template <typename T, size_t N, HWY_IF_GT128(T, N)>
235 static_assert(N <= HWY_LANES(T), "N cannot exceed a full vector");
239 static_assert(div <= 8, "Invalid N - must be <=128 bit, or >=1/8th");
248 #define HWY_SVE_FIRSTN(BASE, CHAR, BITS, NAME, OP) \
249 template <size_t KN> \
250 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, KN) , uint32_t N) { \
251 return sv##OP##_b##BITS##_u32(uint32_t(0), N); \
254 #undef HWY_SVE_FIRSTN
259 #define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
261 #define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, NAME, OP) \
262 template <size_t N> \
263 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N) d) { \
264 return HWY_SVE_PTRUE(BITS); \
268 #undef HWY_SVE_WRAP_PTRUE
270 HWY_API svbool_t PFalse() { return svpfalse_b(); }
276 template <typename T, size_t N>
287 #define HWY_SVE_SET(BASE, CHAR, BITS, NAME, OP) \
288 template <size_t N> \
289 HWY_API HWY_SVE_V(BASE, BITS) \
290 NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_T(BASE, BITS) arg) { \
291 return sv##OP##_##CHAR##BITS(arg); \
304 using VFromD = decltype(Set(D(), TFromD<D>()));
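// Added usage sketch: Set broadcasts a scalar and VFromD names the resulting
// vector type:
//   const Full<int32_t> d;
//   VFromD<decltype(d)> v = Set(d, 5);  // every lane holds 5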
315 #if defined(HWY_EMULATE_SVE)
329 #define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, NAME, OP) \
330 HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
333 template <size_t N> \
334 HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
335 HWY_SVE_D(BASE, BITS, N) , HWY_SVE_V(BASE, BITS) v) { \
340 #define HWY_SVE_CAST(BASE, CHAR, BITS, NAME, OP) \
341 HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
342 return sv##OP##_u8_##CHAR##BITS(v); \
344 template <size_t N> \
345 HWY_INLINE HWY_SVE_V(BASE, BITS) \
346 BitCastFromByte(HWY_SVE_D(BASE, BITS, N) , svuint8_t v) { \
347 return sv##OP##_##CHAR##BITS##_u8(v); \
357 #undef HWY_SVE_CAST_NOP
368 template <class D, class FromV>
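// Added usage sketch: the public BitCast goes through the byte helpers above,
// e.g. to inspect the bit pattern of float lanes:
//   const Full<float> df;
//   const RebindToUnsigned<decltype(df)> du;       // uint32_t lanes
//   svuint32_t bits = BitCast(du, Set(df, 1.0f));  // 0x3F800000 per lane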
389 template <class V, HWY_IF_FLOAT_V(V)>
400 template <class V, HWY_IF_FLOAT_V(V)>
415 template <class V, HWY_IF_FLOAT_V(V)>
425 #define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, NAME, OP) \
426 HWY_API HWY_SVE_V(BASE, BITS) \
427 NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
428 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
432 #undef HWY_SVE_RETV_ARGPVN_SWAP
435 #define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, NAME, OP) \
436 HWY_API HWY_SVE_V(BASE, BITS) \
437 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
438 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
441 #undef HWY_SVE_RETV_ARGPVV_SWAP
443 template <class V, HWY_IF_FLOAT_V(V)>
452 #ifdef HWY_NATIVE_POPCNT
453 #undef HWY_NATIVE_POPCNT
455 #define HWY_NATIVE_POPCNT
459 #define HWY_SVE_POPCNT(BASE, CHAR, BITS, NAME, OP) \
460 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
461 return BitCast(DFromV<decltype(v)>(), \
462 sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
465 #undef HWY_SVE_POPCNT
486 return Or(abs, And(msb, sign));
503 #define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, NAME, OP) \
504 HWY_API HWY_SVE_V(BASE, BITS) \
505 NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
506 return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
510 #undef HWY_SVE_RETV_ARGPVN_MASK
530 #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, NAME, OP) \
531 template <int kBits> \
532 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
533 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
535 HWY_API HWY_SVE_V(BASE, BITS) \
536 NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
537 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
547 #undef HWY_SVE_SHIFT_N
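// Added usage sketch: ShiftLeft/ShiftRight take the count as a template
// argument (compile-time constant); the *Same forms take it at runtime:
//   const Full<uint32_t> du;
//   svuint32_t x = Set(du, 3u);
//   svuint32_t a = ShiftLeft<4>(x);      // 3 << 4 == 48 in every lane
//   svuint32_t b = ShiftLeftSame(x, 4);  // same result, runtime count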
551 #define HWY_SVE_SHIFT(BASE, CHAR, BITS, NAME, OP) \
552 HWY_API HWY_SVE_V(BASE, BITS) \
553 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
554 using TU = HWY_SVE_T(uint, BITS); \
555 return sv##OP##_##CHAR##BITS##_x( \
556 HWY_SVE_PTRUE(BITS), v, BitCast(Simd<TU, HWY_LANES(TU)>(), bits)); \
602 #define HWY_SVE_FMA(BASE, CHAR, BITS, NAME, OP) \
603 HWY_API HWY_SVE_V(BASE, BITS) \
604 NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
605 HWY_SVE_V(BASE, BITS) add) { \
606 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
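// Added note (sketch): assuming the MulAdd/mad pairing used elsewhere in
// Highway, MulAdd(mul, x, add) computes mul * x + add per lane; the macro
// forwards to e.g. svmad_f32_x(pg, x, mul, add), whose operand order differs
// from the Highway signature:
//   svfloat32_t r = MulAdd(a, b, c);  // r = a * b + c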
632 template <class D, typename MFrom>
646 return svand_b_z(b, b, a);
649 return svbic_b_z(b, b, a);
652 return svsel_b(a, a, b);
655 return svsel_b(a, svnand_b_z(a, a, b), b);
660 #define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, NAME, OP) \
661 template <size_t N> \
662 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N) d, svbool_t m) { \
663 return sv##OP##_b##BITS(detail::Mask(d), m); \
667 #undef HWY_SVE_COUNT_TRUE
672 #define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, NAME, OP) \
673 template <size_t N> \
674 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N) d, svbool_t m) { \
675 return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
679 #undef HWY_SVE_COUNT_TRUE_FULL
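// Added usage sketch: CountTrue is limited to the lanes selected by
// detail::Mask(d), so capped descriptors only count their active lanes:
//   const Full<int32_t> d;
//   const size_t n = CountTrue(d, FirstN(d, 3));  // n == 3 when Lanes(d) >= 3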
684 template <typename T, size_t N>
690 template <typename T, size_t N>
696 template <typename T, size_t N>
702 #define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, NAME, OP) \
703 HWY_API HWY_SVE_V(BASE, BITS) \
704 NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
705 return sv##OP##_##CHAR##BITS(m, yes, no); \
709 #undef HWY_SVE_IF_THEN_ELSE
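// Added usage sketch: IfThenElse selects per lane from a svbool_t mask, e.g.
//   svint32_t lo = IfThenElse(Lt(a, b), a, b);  // per-lane minimum of a and b
// (a and b are assumed svint32_t inputs.)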
712 template <class M, class V>
718 template <class M, class V>
726 #define HWY_SVE_COMPARE(BASE, CHAR, BITS, NAME, OP) \
727 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
728 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
730 #define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, NAME, OP) \
731 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
732 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
750 #undef HWY_SVE_COMPARE
751 #undef HWY_SVE_COMPARE_N
778 template <class D, HWY_IF_NOT_FLOAT_D(D)>
781 return BitCast(d, detail::SubN(mask, v0, 1));
784 template <class D, HWY_IF_FLOAT_D(D)>
793 #define HWY_SVE_LOAD(BASE, CHAR, BITS, NAME, OP) \
794 template <size_t N> \
795 HWY_API HWY_SVE_V(BASE, BITS) \
796 NAME(HWY_SVE_D(BASE, BITS, N) d, \
797 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
798 return sv##OP##_##CHAR##BITS(detail::Mask(d), p); \
801 #define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, NAME, OP) \
802 template <size_t N> \
803 HWY_API HWY_SVE_V(BASE, BITS) \
804 NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N) d, \
805 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
806 return sv##OP##_##CHAR##BITS(m, p); \
809 #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, NAME, OP) \
810 template <size_t N> \
811 HWY_API HWY_SVE_V(BASE, BITS) \
812 NAME(HWY_SVE_D(BASE, BITS, N) d, \
813 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
815 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
818 #define HWY_SVE_STORE(BASE, CHAR, BITS, NAME, OP) \
819 template <size_t N> \
820 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
821 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
822 sv##OP##_##CHAR##BITS(detail::Mask(d), p, v); \
832 #undef HWY_SVE_MASKED_LOAD
833 #undef HWY_SVE_LOAD_DUP128
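// Added usage sketch: Load/Store predicate on detail::Mask(d), so they also
// respect descriptors capped below the hardware vector length:
//   float buf[64] = {0};  // sized for the largest possible SVE vector
//   const Full<float> d;
//   Store(Add(Load(d, buf), Set(d, 1.0f)), d, buf);  // adds 1 to the first Lanes(d) elements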
860 template <class V, class D>
867 #define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, NAME, OP) \
868 template <size_t N> \
869 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
870 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
871 HWY_SVE_V(int, BITS) offset) { \
872 sv##OP##_s##BITS##offset_##CHAR##BITS(detail::Mask(d), base, offset, v); \
875 #define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, NAME, OP) \
876 template <size_t N> \
877 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
878 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
879 HWY_SVE_V(int, BITS) index) { \
880 sv##OP##_s##BITS##index_##CHAR##BITS(detail::Mask(d), base, index, v); \
885 #undef HWY_SVE_SCATTER_OFFSET
886 #undef HWY_SVE_SCATTER_INDEX
890 #define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, NAME, OP) \
891 template <size_t N> \
892 HWY_API HWY_SVE_V(BASE, BITS) \
893 NAME(HWY_SVE_D(BASE, BITS, N) d, \
894 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
895 HWY_SVE_V(int, BITS) offset) { \
896 return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::Mask(d), base, \
899 #define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, NAME, OP) \
900 template <size_t N> \
901 HWY_API HWY_SVE_V(BASE, BITS) \
902 NAME(HWY_SVE_D(BASE, BITS, N) d, \
903 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
904 HWY_SVE_V(int, BITS) index) { \
905 return sv##OP##_s##BITS##index_##CHAR##BITS(detail::Mask(d), base, index); \
910 #undef HWY_SVE_GATHER_OFFSET
911 #undef HWY_SVE_GATHER_INDEX
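// Added usage sketch: GatherIndex scales indices by sizeof(T); GatherOffset
// takes raw byte offsets. Both expect signed indices of the same lane width:
//   const Full<float> d;
//   const RebindToSigned<decltype(d)> di;               // int32_t indices
//   svfloat32_t v = GatherIndex(d, base, Iota(di, 0));  // v[i] = base[i]
// (base is an assumed const float* with enough readable elements.)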
915 #define HWY_SVE_STORE3(BASE, CHAR, BITS, NAME, OP) \
916 template <size_t N> \
917 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
918 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_D(BASE, BITS, N) d, \
919 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
920 const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
921 sv##OP##_##CHAR##BITS(detail::Mask(d), unaligned, triple); \
925 #undef HWY_SVE_STORE3
929 #define HWY_SVE_STORE4(BASE, CHAR, BITS, NAME, OP) \
930 template <size_t N> \
931 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
932 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
933 HWY_SVE_D(BASE, BITS, N) d, \
934 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
935 const sv##BASE##BITS##x4_t quad = \
936 svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
937 sv##OP##_##CHAR##BITS(detail::Mask(d), unaligned, quad); \
941 #undef HWY_SVE_STORE4
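// Added usage sketch: StoreInterleaved3/4 write structure-of-arrays inputs as
// interleaved tuples, e.g. planar RGB to packed RGB bytes:
//   StoreInterleaved3(r, g, b, Full<uint8_t>(), out);  // out = r0 g0 b0 r1 ...
// (r, g, b are assumed svuint8_t and out an assumed uint8_t* destination.)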
948 #define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, NAME, OP) \
949 template <size_t N> \
950 HWY_API HWY_SVE_V(BASE, BITS) \
951 NAME(HWY_SVE_D(BASE, BITS, N) , \
952 VFromD<Simd<MakeNarrow<HWY_SVE_T(BASE, BITS)>, \
953 HWY_LANES(HWY_SVE_T(BASE, BITS)) * 2>> \
955 return sv##OP##_##CHAR##BITS(v); \
1016 #undef HWY_SVE_PROMOTE_TO
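// Added usage sketch: PromoteTo widens the lower half of the narrow input
// vector (svunpklo and friends), e.g. sign-extending int16_t to int32_t:
//   svint16_t narrow = Set(Full<int16_t>(), -1);
//   svint32_t wide = PromoteTo(Full<int32_t>(), narrow);  // every lane == -1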
1032 template <typename TN, class VU>
1034 return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
1038 template <typename TN, class VI>
1041 return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
1048 const DFromV<decltype(v)> di;
1050 using TN = TFromD<decltype(dn)>;
1054 const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
1055 return svuzp1_u8(vn, vn);
1060 const DFromV<decltype(v)> di;
1062 using TN = TFromD<decltype(dn)>;
1066 const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
1067 return svuzp1_u16(vn, vn);
1072 const DFromV<decltype(v)> di;
1075 using TN = TFromD<decltype(dn)>;
1079 const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
1080 const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
1081 return svuzp1_u8(x2, x2);
1089 const svuint16_t cast16 = BitCast(du16, v);
1090 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
1091 const svuint8_t cast8 = BitCast(du8, x2);
1092 return svuzp1_u8(cast8, cast8);
1099 const DFromV<decltype(v)> di;
1100 using TN = TFromD<decltype(dn)>;
1101 #if HWY_TARGET == HWY_SVE2
1102 const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
1104 const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
1106 return svuzp1_s8(vn, vn);
1111 const DFromV<decltype(v)> di;
1112 using TN = TFromD<decltype(dn)>;
1113 #if HWY_TARGET == HWY_SVE2
1114 const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
1116 const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
1118 return svuzp1_s16(vn, vn);
1123 const DFromV<decltype(v)> di;
1124 using TN = TFromD<decltype(dn)>;
1126 #if HWY_TARGET == HWY_SVE2
1127 const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
1129 const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
1131 const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
1132 return BitCast(dn, svuzp1_s8(v2, v2));
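// Added usage sketch: DemoteTo saturates to the narrow range and then packs
// with svuzp1, e.g. int32_t -> uint8_t:
//   const Full<int32_t> d32;
//   const Rebind<uint8_t, decltype(d32)> du8;
//   svuint8_t packed = DemoteTo(du8, Set(d32, 1000));  // saturates to 255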
1141 #define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, NAME, OP) \
1142 HWY_INLINE HWY_SVE_V(BASE, BITS) \
1143 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1144 return sv##OP##_##CHAR##BITS(lo, hi); \
1148 #undef HWY_SVE_CONCAT_EVERY_SECOND
1152 #define HWY_SVE_SPLICE(BASE, CHAR, BITS, NAME, OP) \
1153 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
1154 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
1155 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
1158 #undef HWY_SVE_SPLICE
1169 return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
1180 return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
1188 return svcvt_f16_f32_x(detail::PTrue(d), v);
1199 return svcvt_f32_f64_x(detail::PTrue(d), v);
1204 return svcvt_s32_f64_x(detail::PTrue(d), v);
1209 #define HWY_SVE_CONVERT(BASE, CHAR, BITS, NAME, OP) \
1210 template <size_t N> \
1211 HWY_API HWY_SVE_V(BASE, BITS) \
1212 NAME(HWY_SVE_D(BASE, BITS, N) , HWY_SVE_V(int, BITS) v) { \
1213 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1216 template <size_t N> \
1217 HWY_API HWY_SVE_V(int, BITS) \
1218 NAME(HWY_SVE_D(int, BITS, N) , HWY_SVE_V(BASE, BITS) v) { \
1219 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1224 #undef HWY_SVE_CONVERT
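// Added usage sketch: ConvertTo maps between same-width int and float lanes;
// float -> int truncates toward zero:
//   const Full<float> df;
//   const RebindToSigned<decltype(df)> di;
//   svint32_t i = ConvertTo(di, Set(df, 2.5f));  // every lane == 2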
1228 template <class VF, class DI = RebindToSigned<DFromV<VF>>>
1236 #define HWY_SVE_IOTA(BASE, CHAR, BITS, NAME, OP) \
1237 template <size_t N> \
1238 HWY_API HWY_SVE_V(BASE, BITS) \
1239 NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_T(BASE, BITS) first) { \
1240 return sv##OP##_##CHAR##BITS(first, 1); \
1246 template <class D, HWY_IF_FLOAT_D(D)>
1256 template <typename T, size_t N>
1260 template <typename T, size_t N>
1268 #define HWY_SVE_EXT(BASE, CHAR, BITS, NAME, OP) \
1269 template <size_t kIndex> \
1270 HWY_API HWY_SVE_V(BASE, BITS) \
1271 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1272 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
1280 template <class D, class V>
1286 template <class D, class V>
1292 template <class D, class V>
1298 template <class D, class V>
1301 const V lo_upper = detail::Splice(lo, lo, mask_upper);
1306 template <class D, class V2>
1313 template <class D, class V>
1320 template <class D2, class V>
1330 template <class D2, class V>
1339 #define HWY_SVE_GET_LANE(BASE, CHAR, BITS, NAME, OP) \
1340 HWY_API HWY_SVE_T(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1341 return sv##OP##_##CHAR##BITS(detail::PFalse(), v); \
1345 #undef HWY_SVE_GET_LANE
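// Added note (sketch): svlasta with an all-false predicate returns the first
// element, so GetLane extracts lane 0:
//   int32_t first = GetLane(Iota(Full<int32_t>(), 7));  // first == 7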
1357 const auto even_in_odd = detail::Insert(even, 0);
1358 return detail::InterleaveOdd(even_in_odd, odd);
1363 template <class D, class DI = RebindToSigned<D>>
1365 #if HWY_IS_DEBUG_BUILD
1366 const size_t N = Lanes(d);
1367 for (size_t i = 0; i < N; ++i) {
1373 return Load(DI(), idx);
1377 #define HWY_SVE_TABLE(BASE, CHAR, BITS, NAME, OP) \
1378 HWY_API HWY_SVE_V(BASE, BITS) \
1379 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(int, BITS) idx) { \
1380 const auto idx_u = BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), idx); \
1381 return sv##OP##_##CHAR##BITS(v, idx_u); \
1385 #undef HWY_SVE_TABLE
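// Added usage sketch: table indices are held in a vector (int32_t for 32-bit
// lanes) and must be smaller than Lanes(d):
//   const Full<uint32_t> d;
//   const auto idx = SetTableIndices(d, index_array);  // assumed int32_t array
//   const auto shuffled = TableLookupLanes(v, idx);    // shuffled[i] = v[idx[i]]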
1390 #error "Update macro"
1392 #define HWY_SVE_REVERSE(BASE, CHAR, BITS, NAME, OP) \
1393 template <size_t N> \
1394 HWY_API HWY_SVE_V(BASE, BITS) \
1395 NAME(Simd<HWY_SVE_T(BASE, BITS), N> d, HWY_SVE_V(BASE, BITS) v) { \
1396 const auto reversed = sv##OP##_##CHAR##BITS(v); \
1398 const size_t all_lanes = \
1399 detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>()); \
1401 const svbool_t mask = Not(FirstN(d, all_lanes - Lanes(d))); \
1402 return detail::Splice(reversed, reversed, mask); \
1406 #undef HWY_SVE_REVERSE
1410 #define HWY_SVE_COMPRESS(BASE, CHAR, BITS, NAME, OP) \
1411 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1412 return sv##OP##_##CHAR##BITS(mask, v); \
1416 #undef HWY_SVE_COMPRESS
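// Added usage sketch: Compress moves lanes whose mask bit is set to the front
// of the result:
//   const Full<float> d;
//   const auto negatives = Compress(v, Lt(v, Zero(d)));  // v is an assumed input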
1418 template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
1420 static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
1427 const svbool_t mask32L = svunpklo_b(mask16);
1428 const svbool_t mask32H = svunpkhi_b(mask16);
1430 const auto compressedL = Compress(v32L, mask32L);
1431 const auto compressedH = Compress(v32H, mask32H);
1434 const V evenL = BitCast(d16, compressedL);
1435 const V evenH = BitCast(d16, compressedH);
1442 const size_t countL = detail::CountTrueFull(dw, mask32L);
1443 const auto compressed_maskL = FirstN(d16, countL);
1444 return detail::Splice(v16H, v16L, compressed_maskL);
1449 const DFromV<decltype(v)> df;
1456 template <class V, class M, class D>
1471 template <typename T, size_t N>
1474 return HWY_MIN(16 / sizeof(T), N);
1477 template <class D, class V>
1480 return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
1483 template <size_t kLanes, class D>
1487 const auto idx_mod = detail::AndN(Iota(di, 0), kLanesPerBlock - 1);
1488 return detail::LtN(BitCast(di, idx_mod), kLanes);
1493 template <size_t kBytes, class D, class V = VFromD<D>>
1496 const auto hi8 = BitCast(d8, hi);
1497 const auto lo8 = BitCast(d8, lo);
1498 const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
1499 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
1506 #define HWY_SVE_SHUFFLE_2301(BASE, CHAR, BITS, NAME, OP) \
1507 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1508 const DFromV<decltype(v)> d; \
1509 const svuint64_t vu64 = BitCast(Repartition<uint64_t, decltype(d)>(), v); \
1510 return BitCast(d, sv##OP##_u64_x(HWY_SVE_PTRUE(64), vu64)); \
1514 #undef HWY_SVE_SHUFFLE_2301
1516 template <class V, HWY_IF_FLOAT_V(V)>
1528 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1529 const svuint8_t v8 = BitCast(d8, v);
1530 return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
1538 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1539 const svuint8_t v8 = BitCast(d8, v);
1540 return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
1548 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1549 const svuint8_t v8 = BitCast(d8, v);
1550 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
1558 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
1559 const svuint8_t v8 = BitCast(d8, v);
1560 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
1571 template <class V, class VI>
1581 template <class V, class VI>
1587 auto idx8 = BitCast(di8, idx);
1588 const auto msb = Lt(idx8, Zero(di8));
1590 #if defined(HWY_EMULATE_SVE)
1600 template <int kLane, class V>
1605 static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
1608 idx = detail::AddN(idx, kLane);
1615 template <size_t kLanes, class D, class V = VFromD<D>>
1618 const auto zero = Zero(d);
1619 const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
1621 return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
1624 template <size_t kLanes, class V>
1626 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
1630 template <size_t kLanes, typename T, size_t N, class V = VFromD<Simd<T, N>>>
1638 const auto shifted = detail::Ext<kLanes>(v, v);
1647 template <int kBytes, class D, class V = VFromD<D>>
1653 template <int kBytes, class V>
1655 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
1659 template <int kBytes, class D, class V = VFromD<D>>
1673 template <class D, class V>
1678 const auto a64 = BitCast(d64, a);
1679 const auto b64 = BitCast(d64, b);
1694 template <typename T, class V = VFromD<Full<T>>>
1698 const auto a64 = BitCast(d64, a);
1699 const auto b64 = BitCast(d64, b);
1706 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = VFromD<Simd<T, N>>>
1708 static_assert(IsSame<T, TFromV<V>>(), "D/V mismatch");
1709 const Half<decltype(d)> d2;
1714 template <typename T, size_t N,
1716 class V = VFromD<Simd<T, N>>>
1718 static_assert(IsSame<T, TFromV<V>>(), "D/V mismatch");
1720 if (Lanes(d) * sizeof(T) < 16) {
1721 const Half<decltype(d)> d2;
1729 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1735 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
1741 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1750 #define HWY_SVE_REDUCE(BASE, CHAR, BITS, NAME, OP) \
1751 template <size_t N> \
1752 HWY_API HWY_SVE_V(BASE, BITS) \
1753 NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_V(BASE, BITS) v) { \
1754 return Set(d, sv##OP##_##CHAR##BITS(detail::Mask(d), v)); \
1764 #undef HWY_SVE_REDUCE
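// Added usage sketch: the reductions broadcast their result to all lanes, so
// a scalar total is obtained via GetLane:
//   const Full<float> d;
//   const float total = GetLane(SumOfLanes(d, v));  // v is an assumed input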
1781 const Repartition<uint32_t, decltype(dbf16)> du32;
1782 const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
1802 #if HWY_TARGET == HWY_SVE2
1808 return ShiftRight<1>(Add(Add(a, b), Set(DFromV<V>(), 1)));
1815 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
1818 const svuint8_t iota = Iota(du, 0);
1821 const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
1823 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
1826 const svuint8_t bit = Shl(Set(du, 1), detail::AndN(iota, 7));
1831 template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
1834 const RebindToUnsigned<D> du;
1835 const Repartition<uint8_t, D> du8;
1838 const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
1841 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
1844 const svuint16_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
1849 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
1852 const RebindToUnsigned<D> du;
1853 const Repartition<uint8_t, D> du8;
1857 const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
1860 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
1863 const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
1868 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
1871 const RebindToUnsigned<D> du;
1876 CopyBytes<4>(bits, &mask_bits);
1877 const auto vbits = Set(du, mask_bits);
1880 const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
1890 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1892 return svdup_n_u8_z(m, 1);
1894 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1897 const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
1900 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1904 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1907 const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
1914 template <typename T, size_t N>
1926 const size_t num_bits = Lanes(d);
1927 const size_t num_bytes = (num_bits + 8 - 1) / 8;
1935 const int mask = (1 << num_bits) - 1;
1936 bits[0] = static_cast<uint8_t>(bits[0] & mask);
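// Added usage sketch: StoreMaskBits/LoadMaskBits round-trip a mask through a
// packed bit array, one bit per lane:
//   uint8_t bits[32] = {0};  // 256 lanes covers any current SVE vector length
//   const size_t num_bytes = StoreMaskBits(d, m, bits);
//   const svbool_t m2 = LoadMaskBits(d, bits);  // equals m on the active lanes
// (d and m are an assumed descriptor and mask of matching lane type.)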
1958 #if HWY_TARGET == HWY_SVE2
1964 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1966 #if HWY_TARGET == HWY_SVE2
1969 const auto lo = Mul(a, b);
1971 return BitCast(DW(), detail::InterleaveEven(lo, hi));
1976 const auto lo = Mul(a, b);
1978 return detail::InterleaveEven(lo, hi);
1982 const auto lo = Mul(a, b);
1984 return detail::InterleaveOdd(lo, hi);
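// Added note (sketch): for 64-bit inputs, MulEven/MulOdd return full 128-bit
// products split across adjacent lanes, e.g.
//   svuint64_t p = MulEven(a, b);  // lane 0 = low half, lane 1 = high half of a[0]*b[0]
// (a and b are assumed svuint64_t inputs.)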
1992 const svfloat32_t sum0,
1993 svfloat32_t& sum1) {
1997 const svuint16_t zero = Zero(du16);
2008 #if defined(__ARM_FEATURE_SVE2_AES)
2011 #ifdef HWY_NATIVE_AES
2012 #undef HWY_NATIVE_AES
2014 #define HWY_NATIVE_AES
2022 const svuint8_t zero = svdup_n_u8(0);
2023 return Xor(vaesmcq_u8(vaeseq_u8(state, zero)), round_key);
2027 return svpmullb_pair(a, b);
2031 return svpmullt_pair(a, b);
2038 #undef HWY_IF_FLOAT_V
2039 #undef HWY_IF_LANE_SIZE_V
2040 #undef HWY_IF_SIGNED_V
2041 #undef HWY_IF_UNSIGNED_V
2043 #undef HWY_SVE_FOREACH
2044 #undef HWY_SVE_FOREACH_F
2045 #undef HWY_SVE_FOREACH_F16
2046 #undef HWY_SVE_FOREACH_F32
2047 #undef HWY_SVE_FOREACH_F64
2048 #undef HWY_SVE_FOREACH_I
2049 #undef HWY_SVE_FOREACH_I08
2050 #undef HWY_SVE_FOREACH_I16
2051 #undef HWY_SVE_FOREACH_I32
2052 #undef HWY_SVE_FOREACH_I64
2053 #undef HWY_SVE_FOREACH_IF
2054 #undef HWY_SVE_FOREACH_U
2055 #undef HWY_SVE_FOREACH_U08
2056 #undef HWY_SVE_FOREACH_U16
2057 #undef HWY_SVE_FOREACH_U32
2058 #undef HWY_SVE_FOREACH_U64
2059 #undef HWY_SVE_FOREACH_UI
2060 #undef HWY_SVE_FOREACH_UI08
2061 #undef HWY_SVE_FOREACH_UI16
2062 #undef HWY_SVE_FOREACH_UI32
2063 #undef HWY_SVE_FOREACH_UI64
2064 #undef HWY_SVE_FOREACH_UIF3264
2065 #undef HWY_SVE_PTRUE
2066 #undef HWY_SVE_RETV_ARGD
2067 #undef HWY_SVE_RETV_ARGPV
2068 #undef HWY_SVE_RETV_ARGPVN
2069 #undef HWY_SVE_RETV_ARGPVV
2070 #undef HWY_SVE_RETV_ARGV
2071 #undef HWY_SVE_RETV_ARGVN
2072 #undef HWY_SVE_RETV_ARGVV