rvv-inl.h
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

template <typename T, size_t N>
HWY_INLINE constexpr size_t MLenFromD(Simd<T, N> /* tag */) {
  // Returns divisor = type bits / LMUL.
  return sizeof(T) * 8 / (N / HWY_LANES(T));
}
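
// For example, with Simd<uint16_t, HWY_LANES(uint16_t) * 2> (SEW=16, LMUL=2),
// this returns 16 / 2 = 8, which matches the vbool8_t mask type: one mask bit
// per 16-bit lane of a vuint16m2_t.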

// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4.
template <typename T, int kShift = 0>
using Full = Simd<T, (kShift < 0) ? (HWY_LANES(T) >> (-kShift))
                                  : (HWY_LANES(T) << kShift)>;
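
// Example: Full<int32_t> is the LMUL=1 descriptor (vint32m1_t), and
// Full<int32_t, 2> selects LMUL=4 (vint32m4_t). Negative kShift would select
// fractional LMUL (e.g. mf4) once the intrinsics support it.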

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP)               \
  X_MACRO(32, 0, 32, NAME, OP)               \
  X_MACRO(16, 0, 16, NAME, OP)               \
  X_MACRO(8, 0, 8, NAME, OP)                 \
  X_MACRO(8, 1, 4, NAME, OP)                 \
  X_MACRO(8, 2, 2, NAME, OP)                 \
  X_MACRO(8, 3, 1, NAME, OP)

// For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because we
// need to token-paste the result. For the same reason, we also pass the
// twice-as-long and half-as-long LMUL suffixes as arguments.
// TODO(janwas): add fractional LMUL
#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP)                 \
  X_MACRO(BASE, CHAR, 8, m1, m2, mf2, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, m2, m4, m1, /*kShift=*/1, /*MLEN=*/4, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, m4, m8, m2, /*kShift=*/2, /*MLEN=*/2, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, m8, __, m4, /*kShift=*/3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP)                   \
  X_MACRO(BASE, CHAR, 16, m1, m2, mf2, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, m2, m4, m1, /*kShift=*/1, /*MLEN=*/8, NAME, OP)   \
  X_MACRO(BASE, CHAR, 16, m4, m8, m2, /*kShift=*/2, /*MLEN=*/4, NAME, OP)   \
  X_MACRO(BASE, CHAR, 16, m8, __, m4, /*kShift=*/3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP)                   \
  X_MACRO(BASE, CHAR, 32, m1, m2, mf2, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, m2, m4, m1, /*kShift=*/1, /*MLEN=*/16, NAME, OP)  \
  X_MACRO(BASE, CHAR, 32, m4, m8, m2, /*kShift=*/2, /*MLEN=*/8, NAME, OP)   \
  X_MACRO(BASE, CHAR, 32, m8, __, m4, /*kShift=*/3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP)                   \
  X_MACRO(BASE, CHAR, 64, m1, m2, mf2, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, m2, m4, m1, /*kShift=*/1, /*MLEN=*/32, NAME, OP)  \
  X_MACRO(BASE, CHAR, 64, m4, m8, m2, /*kShift=*/2, /*MLEN=*/16, NAME, OP)  \
  X_MACRO(BASE, CHAR, 64, m8, __, m4, /*kShift=*/3, /*MLEN=*/8, NAME, OP)

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_08(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_16(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_32(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_64(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_08(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_16(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_32(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_64(X_MACRO, int, i, NAME, OP)

// SEW for float:
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_16(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_32(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_64(X_MACRO, float, f, NAME, OP)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP)

#if HWY_CAP_FLOAT16
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP)
#else
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP)
#endif

// Commonly used type categories for a given SEW:
#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

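// Example expansions: HWY_RVV_T(uint, 32) -> uint32_t, HWY_RVV_V(uint, 32, m2)
// -> vuint32m2_t, HWY_RVV_D(u, 32, m2) -> Du32m2, HWY_RVV_M(8) -> vbool8_t.
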
}  // namespace detail

// TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly

// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless they need many registers).
#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  using HWY_RVV_D(CHAR, SEW, LMUL) = Full<HWY_RVV_T(BASE, SEW), SHIFT>;        \
  using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL);                       \
  template <>                                                                  \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                                \
    using Lane = HWY_RVV_T(BASE, SEW);                                         \
    using type = Full<Lane, SHIFT>;                                            \
  };
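// E.g. for (uint, u, 32, m2, kShift=1), HWY_SPECIALIZE defines Du32m2 =
// Full<uint32_t, 1> and Vu32m2 = vuint32m2_t, and maps vuint32m2_t back to
// Du32m2 via the DFromV_t specialization.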
#if HWY_CAP_FLOAT16
using Vf16m1 = vfloat16m1_t;
using Vf16m2 = vfloat16m2_t;
using Vf16m4 = vfloat16m4_t;
using Vf16m8 = vfloat16m8_t;
using Df16m1 = Full<float16_t, 0>;
using Df16m2 = Full<float16_t, 1>;
using Df16m4 = Full<float16_t, 2>;
using Df16m8 = Full<float16_t, 3>;
#endif

HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) {                   \
    return v##OP##SEW##LMUL();                                                \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e)
#undef HWY_RVV_LANES

template <size_t N>
HWY_API size_t Lanes(Simd<bfloat16_t, N> /* tag */) {
  return Lanes(Simd<uint16_t, N>());
}

// ------------------------------ Common x-macros

// Last argument to most intrinsics. Use when the op has no d arg of its own.
#define HWY_RVV_AVL(SEW, SHIFT) Lanes(Full<HWY_RVV_T(uint, SEW), SHIFT>())
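
// E.g. HWY_RVV_AVL(32, 1) expands to Lanes(Full<uint32_t, 1>()), i.e. VLMAX
// for SEW=32 at LMUL=2; this is the AVL (application vector length) argument
// expected by most intrinsics below.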

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                          OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT));           \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                           OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {             \
    return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT));           \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                           OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT));        \
  }
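
// For example, with NAME=Add, OP=add and (uint, u, 32, m2), HWY_RVV_RETV_ARGVV
// generates roughly:
//   HWY_API vuint32m2_t Add(vuint32m2_t a, vuint32m2_t b) {
//     return vadd_vv_u32m2(a, b, HWY_RVV_AVL(32, 1));
//   }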

// ================================================== INIT

// ------------------------------ Set

#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) {        \
    return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d));                        \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f)
#undef HWY_RVV_SET

// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <size_t N>
decltype(Set(Simd<uint16_t, N>(), 0)) Set(Simd<bfloat16_t, N> d,
                                          bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// Partial vectors
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API VFromD<Full<T>> Set(Simd<T, N> /* tag */, T arg) {
  return Set(Full<T>(), arg);
}

// ------------------------------ Zero

template <typename T, size_t N>
HWY_API VFromD<Simd<T, N>> Zero(const Simd<T, N> d) {
  return Set(d, T(0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                          OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* tag */) {                            \
    return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */                          \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

template <class D>
HWY_API VFromD<D> Undefined(const D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,    \
                        OP)                                                    \
  HWY_API vuint8##LMUL##_t BitCastToByte(vuint8##LMUL##_t v) { return v; }     \
  HWY_API vuint8##LMUL##_t BitCastFromByte(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
                                           vuint8##LMUL##_t v) {               \
    return v;                                                                  \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,   \
                        OP)                                                   \
  HWY_API vuint8##LMUL##_t BitCastToByte(vint8##LMUL##_t v) {                 \
    return vreinterpret_v_i8##LMUL##_u8##LMUL(v);                             \
  }                                                                           \
  HWY_API vint8##LMUL##_t BitCastFromByte(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
                                          vuint8##LMUL##_t v) {               \
    return vreinterpret_v_u8##LMUL##_i8##LMUL(v);                             \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) {       \
    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v);                          \
  }                                                                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                          \
      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) {                \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v);                          \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                        OP)                                                 \
  HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) {    \
    return v##OP##_v_u##SEW##LMUL##_u8##LMUL(                               \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v));                     \
  }                                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                       \
      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) {             \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(                      \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v));                              \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret)
HWY_RVV_FOREACH_U16(HWY_RVV_CAST_U, _, reinterpret)
HWY_RVV_FOREACH_U32(HWY_RVV_CAST_U, _, reinterpret)
HWY_RVV_FOREACH_U64(HWY_RVV_CAST_U, _, reinterpret)
HWY_RVV_FOREACH_I16(HWY_RVV_CAST_IF, _, reinterpret)
HWY_RVV_FOREACH_I32(HWY_RVV_CAST_IF, _, reinterpret)
HWY_RVV_FOREACH_I64(HWY_RVV_CAST_IF, _, reinterpret)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret)

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF

template <size_t N>
HWY_INLINE VFromD<Simd<uint16_t, N>> BitCastFromByte(
    Simd<bfloat16_t, N> /* d */, VFromD<Simd<uint8_t, N>> v) {
  return BitCastFromByte(Simd<uint16_t, N>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// Partial
template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
HWY_API VFromD<Full<T>> BitCast(Simd<T, N> /* tag */, FromV v) {
  return BitCast(Full<T>(), v);
}

namespace detail {

template <class V, class DU = RebindToUnsigned<DFromV<V>>>
HWY_INLINE VFromD<DU> BitCastToUnsigned(const V v) {
  return BitCast(DU(), v);
}

}  // namespace detail
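
// Example: round-tripping a float vector through its unsigned representation.
//   const Df32m1 df;
//   const RebindToUnsigned<decltype(df)> du;       // Du32m1
//   const auto bits = BitCast(du, Set(df, 1.0f));  // lanes hold 0x3F800000
//   const auto f = BitCast(df, bits);              // back to vfloat32m1_t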

// ------------------------------ Iota

namespace detail {

#define HWY_RVV_IOTA(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) {    \
    return v##OP##_##CHAR##SEW##LMUL(Lanes(d));                              \
  }

HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v)
#undef HWY_RVV_IOTA

template <class D, class DU = RebindToUnsigned<D>>
HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
  return BitCastToUnsigned(Iota0(DU()));
}

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE VFromD<Full<MakeUnsigned<T>>> Iota0(Simd<T, N> /* tag */) {
  return Iota0(Full<T>());
}

}  // namespace detail

// ================================================== LOGICAL

// ------------------------------ Not

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}

// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Or

#undef HWY_RVV_OR_MASK

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ AndNot

template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}

// ------------------------------ CopySign

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj)

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd)

// ------------------------------ Sub
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub)

// ------------------------------ SaturatedAdd

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd)

// ------------------------------ SaturatedSub

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub)

// ------------------------------ AverageRound

// TODO(janwas): check vxrm rounding mode
HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu)

// ------------------------------ ShiftLeft[Same]

// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  template <int kBits>                                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT));   \
  }                                                                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                    \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits),        \
                                        HWY_RVV_AVL(SEW, SHIFT));             \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll)

// ------------------------------ ShiftRight[Same]

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra)

#undef HWY_RVV_SHIFT

// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                         OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) {  \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT));   \
  }

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll)

#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                         OP)                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) {  \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits),  \
                                        HWY_RVV_AVL(SEW, SHIFT));            \
  }

HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll)

// ------------------------------ Shr

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra)

#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV

// ------------------------------ Min

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin)

// ------------------------------ Max

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax)

// ------------------------------ Mul

// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
// Used by MulLower.
namespace detail {
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul)
}  // namespace detail

HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul)
HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul)

// ------------------------------ MulHigh

// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
// Used by MulEven; vwmul does not work for m8.
namespace detail {
HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu)
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu)
HWY_RVV_FOREACH_I64(HWY_RVV_RETV_ARGVV, MulHigh, mulh)
}  // namespace detail

HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh)

// ------------------------------ Div
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv)

// ------------------------------ ApproximateReciprocal
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7)

// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7)

// ------------------------------ MulAdd
// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP)    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x,       \
           HWY_RVV_V(BASE, SEW, LMUL) add) {                                   \
    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc)

// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac)

// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac)

// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc)

#undef HWY_RVV_FMA

// ================================================== COMPARE

// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW 8 / LMUL 1 = 1/8th
// of all bits; SEW 8 / LMUL 4 = half of all bits.

// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                           OP)                                                 \
  HWY_API HWY_RVV_M(MLEN)                                                      \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b,                        \
                                                  HWY_RVV_AVL(SEW, SHIFT));    \
  }

// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                           OP)                                                 \
  HWY_API HWY_RVV_M(MLEN)                                                      \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {             \
    return v##OP##_vx_##CHAR##SEW##LMUL##_b##MLEN(a, b,                        \
                                                  HWY_RVV_AVL(SEW, SHIFT));    \
  }

// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq)

// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne)

// ------------------------------ Lt
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt)

namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt)
}  // namespace detail

// ------------------------------ Le
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle)

#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS

// ------------------------------ Gt/Ge

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}

template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return Ne(And(a, bit), Zero(DFromV<V>()));
}
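
// Example: with a = {0, 1, 2, 3, ...} and bit = Set(d, 2), the resulting mask
// is true exactly for lanes whose value has bit 1 set (lanes 2, 3, 6, 7, ...).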

// ------------------------------ Not

// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {   \
    return vm##OP##_m_b##MLEN(m, ~0ull);              \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not)

#undef HWY_RVV_RETM_ARGM

// ------------------------------ And

// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)                 \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
    return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT));         \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)

// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andnot)

// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)

// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)

#undef HWY_RVV_RETM_ARGMM

// ------------------------------ IfThenElse
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN,     \
                             NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,                  \
           HWY_RVV_V(BASE, SEW, LMUL) no) {                                    \
    return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge)

#undef HWY_RVV_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
template <class M, class V>
HWY_API V IfThenZeroElse(const M mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}

// ------------------------------ MaskFromVec

template <class V>
HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
  return Ne(v, Zero(DFromV<V>()));
}

template <class D>
using MFromD = decltype(MaskFromVec(Zero(D())));

template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}

// ------------------------------ VecFromMask

namespace detail {
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, \
                              NAME, OP)                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) {              \
    return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1,                      \
                                         HWY_RVV_AVL(SEW, SHIFT));          \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx)
#undef HWY_RVV_VEC_FROM_MASK
}  // namespace detail

template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return detail::SubS(Zero(d), mask);
}

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ ZeroIfNegative
template <class V>
HWY_API V ZeroIfNegative(const V v) {
  const auto v0 = Zero(DFromV<V>());
  // We already have a zero constant, so avoid IfThenZeroElse.
  return IfThenElse(Lt(v, v0), v0, v);
}

// ------------------------------ BroadcastSignBit
template <class V>
HWY_API V BroadcastSignBit(const V v) {
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
}

// ------------------------------ FindFirstTrue

#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D>                                        \
  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) {  \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch");   \
    return vfirst_m_b##MLEN(m, Lanes(d));                   \
  }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
#undef HWY_RVV_FIND_FIRST_TRUE

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return FindFirstTrue(d, m) < 0;
}

// ------------------------------ AllTrue

#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)      \
  template <class D>                                      \
  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) {          \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d)));     \
  }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE

// ------------------------------ CountTrue

#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)    \
  template <class D>                                      \
  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) {      \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vpopc_m_b##MLEN(m, Lanes(d));                  \
  }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE

// ================================================== MEMORY

// ------------------------------ Load

#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_D(CHAR, SEW, LMUL) d,                                     \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                    \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d));                    \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le)
#undef HWY_RVV_LOAD

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API VFromD<Simd<T, N>> Load(Simd<T, N> /* tag */,
                                const T* HWY_RESTRICT p) {
  return Load(Full<T>(), p);
}

// There is no native BF16, treat as uint16_t.
template <size_t N>
HWY_API VFromD<Simd<uint16_t, N>> Load(Simd<bfloat16_t, N> d,
                                       const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N>
HWY_API void Store(VFromD<Simd<uint16_t, N>> v, Simd<bfloat16_t, N> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ LoadU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, \
                            NAME, OP)                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(CHAR, SEW, LMUL) d,               \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                 \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le)
#undef HWY_RVV_MASKED_LOAD

// ------------------------------ Store

#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                           OP)                                                 \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                              \
                    HWY_RVV_D(CHAR, SEW, LMUL) d,                              \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                   \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d));                   \
  }
HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se)
#undef HWY_RVV_RET_ARGVDP

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API void Store(VFromD<Simd<T, N>> v, Simd<T, N> /* tag */,
                   T* HWY_RESTRICT p) {
  return Store(v, Full<T>(), p);
}

// ------------------------------ StoreU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ Stream
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}
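
// Illustrative sketch of the usual size-agnostic loop built on these ops;
// assumes count is a multiple of Lanes(d) (otherwise handle the remainder
// separately):
//   void AddTo(const float* HWY_RESTRICT x, float* HWY_RESTRICT out,
//              size_t count) {
//     const Full<float> d;
//     const size_t N = Lanes(d);
//     for (size_t i = 0; i < count; i += N) {
//       Store(Add(Load(d, x + i), Load(d, out + i)), d, out + i);
//     }
//   }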

// ------------------------------ ScatterOffset

#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                        OP)                                                 \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                           \
                    HWY_RVV_D(CHAR, SEW, LMUL) d,                           \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,               \
                    HWY_RVV_V(int, SEW, LMUL) offset) {                     \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                            \
        base, detail::BitCastToUnsigned(offset), v, Lanes(d));              \
  }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx)
#undef HWY_RVV_SCATTER

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(VFromD<Simd<T, N>> v, Simd<T, N> /* tag */,
                           T* HWY_RESTRICT base,
                           VFromD<Simd<MakeSigned<T>, N>> offset) {
  return ScatterOffset(v, Full<T>(), base, offset);
}

// ------------------------------ ScatterIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
}

// ------------------------------ GatherOffset

#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_D(CHAR, SEW, LMUL) d,                                       \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,                     \
           HWY_RVV_V(int, SEW, LMUL) offset) {                                 \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                               \
        base, detail::BitCastToUnsigned(offset), Lanes(d));                    \
  }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx)
#undef HWY_RVV_GATHER

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API VFromD<Simd<T, N>> GatherOffset(Simd<T, N> /* tag */,
                                        const T* HWY_RESTRICT base,
                                        VFromD<Simd<MakeSigned<T>, N>> offset) {
  return GatherOffset(Full<T>(), base, offset);
}

// ------------------------------ GatherIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<3>(index));
}

// TODO(janwas): wait for https://github.com/riscv/rvv-intrinsic-doc/issues/95
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG

// ------------------------------ StoreInterleaved3

#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)       \
  HWY_API void NAME(                                                       \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,        \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(CHAR, SEW, LMUL) d,         \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                     \
    const v##BASE##SEW##LMUL##x3_t triple =                                \
        vcreate_##CHAR##SEW##LMUL##x3(v0, v1, v2);                         \
    return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3)
HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3)

#undef HWY_RVV_STORE3

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API void StoreInterleaved3(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
                               VFromD<Simd<T, N>> v2, Simd<T, N> /*tag*/,
                               T* unaligned) {
  return StoreInterleaved3(v0, v1, v2, Full<T>(), unaligned);
}

// ------------------------------ StoreInterleaved4

#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)     \
  HWY_API void NAME(                                                     \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,      \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3,      \
      HWY_RVV_D(CHAR, SEW, LMUL) d,                                      \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                   \
    const v##BASE##SEW##LMUL##x4_t quad =                                \
        vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3);                   \
    return v##OP##e8_v_##CHAR##SEW##LMUL##x4(unaligned, quad, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4)
HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4)

#undef HWY_RVV_STORE4

// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API void StoreInterleaved4(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
                               VFromD<Simd<T, N>> v2, VFromD<Simd<T, N>> v3,
                               Simd<T, N> /*tag*/, T* unaligned) {
  return StoreInterleaved4(v0, v1, v2, v3, Full<T>(), unaligned);
}

#endif  // GCC

// ================================================== CONVERT

#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \
  HWY_API HWY_RVV_V(BASE, BITS, LMUL) PromoteTo(                               \
      HWY_RVV_D(CHAR, BITS, LMUL) d, HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
    return OP##CHAR##BITS##LMUL(v, Lanes(d));                                  \
  }

#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1)  \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2)  \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4)

#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1)  \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2)

// ------------------------------ PromoteTo

HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 16, uint, 8)
HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16)
HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32)
HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)

HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8)
HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16)
HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32)
HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)

#if HWY_CAP_FLOAT16
HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16)
#endif
HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32)

// i32 to f64
HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)

#undef HWY_RVV_PROMOTE_X4
#undef HWY_RVV_PROMOTE_X2
#undef HWY_RVV_PROMOTE

template <size_t N>
HWY_API auto PromoteTo(Simd<int16_t, N> d, VFromD<Simd<uint8_t, N>> v)
    -> VFromD<decltype(d)> {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <size_t N>
HWY_API auto PromoteTo(Simd<int32_t, N> d, VFromD<Simd<uint8_t, N>> v)
    -> VFromD<decltype(d)> {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <size_t N>
HWY_API auto PromoteTo(Simd<int32_t, N> d, VFromD<Simd<uint16_t, N>> v)
    -> VFromD<decltype(d)> {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <size_t N>
HWY_API auto PromoteTo(Simd<float, N> d, VFromD<Simd<bfloat16_t, N>> v)
    -> VFromD<decltype(d)> {
  const RebindToSigned<decltype(d)> di32;
  const Rebind<uint16_t, decltype(d)> du16;
  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

// ------------------------------ DemoteTo U

// Unsigned -> unsigned (also used for bf16)
namespace detail {

HWY_INLINE Vu16m1 DemoteTo(Du16m1 d, const Vu32m2 v) {
  return vnclipu_wx_u16m1(v, 0, Lanes(d));
}
HWY_INLINE Vu16m2 DemoteTo(Du16m2 d, const Vu32m4 v) {
  return vnclipu_wx_u16m2(v, 0, Lanes(d));
}
HWY_INLINE Vu16m4 DemoteTo(Du16m4 d, const Vu32m8 v) {
  return vnclipu_wx_u16m4(v, 0, Lanes(d));
}

HWY_INLINE Vu8m1 DemoteTo(Du8m1 d, const Vu16m2 v) {
  return vnclipu_wx_u8m1(v, 0, Lanes(d));
}
HWY_INLINE Vu8m2 DemoteTo(Du8m2 d, const Vu16m4 v) {
  return vnclipu_wx_u8m2(v, 0, Lanes(d));
}
HWY_INLINE Vu8m4 DemoteTo(Du8m4 d, const Vu16m8 v) {
  return vnclipu_wx_u8m4(v, 0, Lanes(d));
}

}  // namespace detail

// First clamp negative numbers to zero to match x86 packus.
HWY_API Vu16m1 DemoteTo(Du16m1 d, const Vi32m2 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}
HWY_API Vu16m2 DemoteTo(Du16m2 d, const Vi32m4 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}
HWY_API Vu16m4 DemoteTo(Du16m4 d, const Vi32m8 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}

HWY_API Vu8m1 DemoteTo(Du8m1 d, const Vi32m4 v) {
  return vnclipu_wx_u8m1(DemoteTo(Du16m2(), v), 0, Lanes(d));
}
HWY_API Vu8m2 DemoteTo(Du8m2 d, const Vi32m8 v) {
  return vnclipu_wx_u8m2(DemoteTo(Du16m4(), v), 0, Lanes(d));
}

HWY_API Vu8m1 DemoteTo(Du8m1 d, const Vi16m2 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}
HWY_API Vu8m2 DemoteTo(Du8m2 d, const Vi16m4 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}
HWY_API Vu8m4 DemoteTo(Du8m4 d, const Vi16m8 v) {
  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0)));
}

HWY_API Vu8m1 U8FromU32(const Vu32m4 v) {
  const size_t avl = Lanes(Full<uint8_t>());
  return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
}
HWY_API Vu8m2 U8FromU32(const Vu32m8 v) {
  const size_t avl = Lanes(Full<uint8_t, 2>());
  return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
}

// ------------------------------ DemoteTo I

HWY_API Vi8m1 DemoteTo(Di8m1 d, const Vi16m2 v) {
  return vnclip_wx_i8m1(v, 0, Lanes(d));
}
HWY_API Vi8m2 DemoteTo(Di8m2 d, const Vi16m4 v) {
  return vnclip_wx_i8m2(v, 0, Lanes(d));
}
HWY_API Vi8m4 DemoteTo(Di8m4 d, const Vi16m8 v) {
  return vnclip_wx_i8m4(v, 0, Lanes(d));
}

HWY_API Vi16m1 DemoteTo(Di16m1 d, const Vi32m2 v) {
  return vnclip_wx_i16m1(v, 0, Lanes(d));
}
HWY_API Vi16m2 DemoteTo(Di16m2 d, const Vi32m4 v) {
  return vnclip_wx_i16m2(v, 0, Lanes(d));
}
HWY_API Vi16m4 DemoteTo(Di16m4 d, const Vi32m8 v) {
  return vnclip_wx_i16m4(v, 0, Lanes(d));
}

HWY_API Vi8m1 DemoteTo(Di8m1 d, const Vi32m4 v) {
  return DemoteTo(d, DemoteTo(Di16m2(), v));
}
HWY_API Vi8m2 DemoteTo(Di8m2 d, const Vi32m8 v) {
  return DemoteTo(d, DemoteTo(Di16m4(), v));
}

// ------------------------------ DemoteTo F

#if HWY_CAP_FLOAT16
HWY_API Vf16m1 DemoteTo(Df16m1 d, const Vf32m2 v) {
  return vfncvt_rod_f_f_w_f16m1(v, Lanes(d));
}
HWY_API Vf16m2 DemoteTo(Df16m2 d, const Vf32m4 v) {
  return vfncvt_rod_f_f_w_f16m2(v, Lanes(d));
}
HWY_API Vf16m4 DemoteTo(Df16m4 d, const Vf32m8 v) {
  return vfncvt_rod_f_f_w_f16m4(v, Lanes(d));
}
#endif

HWY_API Vf32m1 DemoteTo(Df32m1 d, const Vf64m2 v) {
  return vfncvt_rod_f_f_w_f32m1(v, Lanes(d));
}
HWY_API Vf32m2 DemoteTo(Df32m2 d, const Vf64m4 v) {
  return vfncvt_rod_f_f_w_f32m2(v, Lanes(d));
}
HWY_API Vf32m4 DemoteTo(Df32m4 d, const Vf64m8 v) {
  return vfncvt_rod_f_f_w_f32m4(v, Lanes(d));
}

HWY_API Vi32m1 DemoteTo(Di32m1 d, const Vf64m2 v) {
  return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
}
HWY_API Vi32m2 DemoteTo(Di32m2 d, const Vf64m4 v) {
  return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
}
HWY_API Vi32m4 DemoteTo(Di32m4 d, const Vf64m8 v) {
  return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
}

template <size_t N>
HWY_API VFromD<Simd<uint16_t, N>> DemoteTo(Simd<bfloat16_t, N> d,
                                           VFromD<Simd<float, N>> v) {
  const RebindToUnsigned<decltype(d)> du16;
  const Rebind<uint32_t, decltype(d)> du32;
  return detail::DemoteTo(du16, BitCast(du32, v));
}

// ------------------------------ ConvertTo F

#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,    \
                        OP)                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      ConvertTo(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_V(int, SEW, LMUL) v) {   \
    return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d));                              \
  }                                                                            \
  /* Truncates (rounds toward zero). */                                        \
  HWY_API HWY_RVV_V(int, SEW, LMUL)                                            \
      ConvertTo(HWY_RVV_D(i, SEW, LMUL) d, HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d));                          \
  }                                                                            \
  /* Uses default rounding mode. */                                            \
  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT));               \
  }

// API only requires f32 but we provide f64 for internal use (otherwise, it
// seems difficult to implement Iota without a _mf2 vector half).
HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _)
#undef HWY_RVV_CONVERT

// Partial
template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
HWY_API VFromD<Simd<T, N>> ConvertTo(Simd<T, N> /* tag */, FromV v) {
  return ConvertTo(Full<T>(), v);
}

// ================================================== COMBINE

namespace detail {

// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
// offsets are implicitly relative to the start of their 128-bit block.
template <typename T, size_t N>
constexpr size_t LanesPerBlock(Simd<T, N> /* tag */) {
  // Also cap to the limit imposed by D (for fixed-size <= 128-bit vectors).
  return HWY_MIN(16 / sizeof(T), N);
}
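
// Example: LanesPerBlock(Simd<uint32_t, HWY_LANES(uint32_t)>()) is 4 (a
// 128-bit block holds four u32 lanes), whereas for a partial Simd<uint8_t, 8>
// the N cap yields 8.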

template <class D, class V>
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
  using T = MakeUnsigned<TFromD<D>>;
  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
}

template <size_t kLanes, class D>
HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
  const RebindToUnsigned<D> du;
  const RebindToSigned<D> di;
  constexpr size_t kLanesPerBlock = LanesPerBlock(du);
  const auto idx_mod = AndS(Iota0(du), kLanesPerBlock - 1);
  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
}

// vector = f(vector, vector, size_t)
#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src,    \
           size_t lanes) {                                                    \
    return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes,                      \
                                        HWY_RVV_AVL(SEW, SHIFT));             \
  }

HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup)
HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown)

#undef HWY_RVV_SLIDE

}  // namespace detail
1378 // ------------------------------ ConcatUpperLower
1379 template <class V>
1380 HWY_API V ConcatUpperLower(const V hi, const V lo) {
1381  const RebindToSigned<DFromV<V>> di;
1382  return IfThenElse(FirstN(di, Lanes(di) / 2), lo, hi);
1383 }
1384 
1385 // ------------------------------ ConcatLowerLower
1386 template <class V>
1387 HWY_API V ConcatLowerLower(const V hi, const V lo) {
1388  return detail::SlideUp(lo, hi, Lanes(DFromV<V>()) / 2);
1389 }
1390 
1391 // ------------------------------ ConcatUpperUpper
1392 template <class V>
1393 HWY_API V ConcatUpperUpper(const V hi, const V lo) {
1394  // Move upper half into lower
1395  const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
1396  return ConcatUpperLower(hi, lo_down);
1397 }
1398 
1399 // ------------------------------ ConcatLowerUpper
1400 template <class V>
1401 HWY_API V ConcatLowerUpper(const V hi, const V lo) {
1402  // Move half of both inputs to the other half
1403  const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
1404  const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
1405  return ConcatUpperLower(hi_up, lo_down);
1406 }
1407 
1408 // ------------------------------ Combine
1409 
1410 // TODO(janwas): implement after LMUL ext/trunc
1411 #if 0
1412 
1413 template <class V>
1414 HWY_API V Combine(const V a, const V b) {
1415  using D = DFromV<V>;
1416  // double LMUL of inputs, then SlideUp with Lanes().
1417 }
1418 
1419 #endif
1420 
1421 // ------------------------------ ZeroExtendVector
1422 
1423 template <class V>
1424 HWY_API V ZeroExtendVector(const V lo) {
1425  return Combine(Xor(lo, lo), lo);
1426 }
1427 
1428 // ------------------------------ Lower/UpperHalf
1429 
1430 namespace detail {
1431 #define HWY_RVV_TRUNC(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
1432  HWY_API HWY_RVV_V(BASE, SEW, HALF) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1433  return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##HALF(v); /* no AVL */ \
1434  }
1439 #if HWY_CAP_FLOAT16
1441 #endif
1443 #undef HWY_RVV_TRUNC
1444 } // namespace detail
1445 
1446 template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
1447 HWY_API VFromD<D> LowerHalf(const D /* tag */, const VFromD<D> v) {
1448  return detail::Trunc(v);
1449 }
1450 
1451 // Intrinsics do not provide mf2 for 64-bit T because VLEN might only be 64,
1452 // so "half-vectors" might not exist. However, the application processor profile
1453 // requires VLEN >= 128. Bypass this by casting to 32-bit.
1454 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
1455 HWY_API VFromD<D> LowerHalf(const D d, const VFromD<D> v) {
1456  const Repartition<uint32_t, D> d32;
1457  return BitCast(d, detail::Trunc(BitCast(Twice<decltype(d32)>(), v)));
1458 }
1459 
1460 template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
1461 HWY_API VFromD<D> UpperHalf(const D d, const VFromD<D> v) {
1462  return LowerHalf(d, detail::SlideDown(v, v, Lanes(d)));
1463 }
1464 
// ================================================== SWIZZLE

// ------------------------------ GetLane

#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
                         OP)                                                 \
  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {          \
    return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */        \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x)
HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f)
#undef HWY_RVV_GET_LANE

// ------------------------------ OddEven
template <class V>
HWY_API V OddEven(const V a, const V b) {
  const RebindToUnsigned<DFromV<V>> du;  // Iota0 is unsigned only
  const auto is_even = Eq(detail::AndS(detail::Iota0(du), 1), Zero(du));
  return IfThenElse(is_even, b, a);
}

// ------------------------------ TableLookupLanes

template <class D, class DU = RebindToUnsigned<D>>
HWY_API VFromD<DU> SetTableIndices(D d, const TFromD<DU>* idx) {
#if HWY_IS_DEBUG_BUILD
  const size_t N = Lanes(d);
  for (size_t i = 0; i < N; ++i) {
    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<TFromD<DU>>(N));
  }
#else
  (void)d;
#endif
  return Load(DU(), idx);
}

// Lanes smaller than 32 bits are not part of the Highway API for this op, but
// are used in Broadcast. This limits VLMAX to 2048! We could instead use
// vrgatherei16.
#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) {    \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT));     \
  }

HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
#undef HWY_RVV_TABLE

// ------------------------------ Reverse
template <class D>
HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  const size_t N = Lanes(du);
  const auto idx = Sub(Set(du, static_cast<TU>(N - 1)), detail::Iota0(du));
  return TableLookupLanes(v, idx);
}

// ------------------------------ Compress

#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,  \
                         OP)                                                  \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) {              \
    return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress)
HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress)
HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress)
HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress)
#undef HWY_RVV_COMPRESS

// ------------------------------ CompressStore
template <class V, class M, class D>
HWY_API size_t CompressStore(const V v, const M mask, const D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(Compress(v, mask), d, unaligned);
  return CountTrue(d, mask);
}
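
// Example (illustrative): store only the lanes of v that are positive.
//   const auto mask = Gt(v, Zero(d));
//   const size_t num_written = CompressStore(v, mask, d, out);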

// ================================================== BLOCKWISE

// ------------------------------ CombineShiftRightBytes
template <size_t kBytes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
  const Repartition<uint8_t, decltype(d)> d8;
  const auto hi8 = BitCast(d8, hi);
  const auto lo8 = BitCast(d8, lo);
  const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
  const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
  const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
}

// ------------------------------ CombineShiftRightLanes
template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
  constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
  const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
  const auto lo_down = detail::SlideDown(lo, lo, kLanes);
  const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
  return IfThenElse(is_lo, lo_down, hi_up);
}

// ------------------------------ Shuffle2301 (ShiftLeft)
template <class V>
HWY_API V Shuffle2301(const V v) {
  const DFromV<V> d;
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  const Repartition<uint64_t, decltype(d)> du64;
  const auto v64 = BitCast(du64, v);
  return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
}

// ------------------------------ Shuffle2103
template <class V>
HWY_API V Shuffle2103(const V v) {
  const DFromV<V> d;
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  return CombineShiftRightLanes<3>(d, v, v);
}

// ------------------------------ Shuffle0321
template <class V>
HWY_API V Shuffle0321(const V v) {
  const DFromV<V> d;
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  return CombineShiftRightLanes<1>(d, v, v);
}

// ------------------------------ Shuffle1032
template <class V>
HWY_API V Shuffle1032(const V v) {
  const DFromV<V> d;
  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
  return CombineShiftRightLanes<2>(d, v, v);
}

// ------------------------------ Shuffle01
template <class V>
HWY_API V Shuffle01(const V v) {
  const DFromV<V> d;
  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
  return CombineShiftRightLanes<1>(d, v, v);
}

// ------------------------------ Shuffle0123
template <class V>
HWY_API V Shuffle0123(const V v) {
  return Shuffle2301(Shuffle1032(v));
}

// ------------------------------ TableLookupBytes

template <class V, class VI>
HWY_API VI TableLookupBytes(const V v, const VI idx) {
  const DFromV<VI> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto offsets128 = detail::OffsetsOf128BitBlocks(d8, detail::Iota0(d8));
  const auto idx8 = Add(BitCast(d8, idx), offsets128);
  return BitCast(d, TableLookupLanes(BitCast(d8, v), idx8));
}

template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
  const DFromV<VI> d;
  // Mask size must match vector type, so cast everything to this type.
  const Repartition<int8_t, decltype(d)> di8;
  const auto lookup = TableLookupBytes(BitCast(di8, v), BitCast(di8, idx));
  const auto msb = Lt(BitCast(di8, idx), Zero(di8));
  return BitCast(d, IfThenZeroElse(msb, lookup));
}
1638 
1639 // ------------------------------ Broadcast
1640 template <int kLane, class V>
1641 HWY_API V Broadcast(const V v) {
1642  const DFromV<V> d;
1643  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
1644  static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
1645  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
1646  if (kLane != 0) {
1647  idx = detail::AddS(idx, kLane);
1648  }
1649  return TableLookupLanes(v, idx);
1650 }
1651 
1652 // ------------------------------ ShiftLeftLanes
1653 
1654 template <size_t kLanes, class D, class V = VFromD<D>>
1655 HWY_API V ShiftLeftLanes(const D d, const V v) {
1656  const RebindToSigned<decltype(d)> di;
1657  const auto shifted = detail::SlideUp(v, v, kLanes);
1658  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
1659  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
1660  const auto idx_mod = detail::AndS(detail::Iota0(di), kLanesPerBlock - 1);
1661  const auto clear = Lt(BitCast(di, idx_mod), Set(di, kLanes));
1662  return IfThenZeroElse(clear, shifted);
1663 }
1664 
1665 template <size_t kLanes, class V>
1666 HWY_API V ShiftLeftLanes(const V v) {
1667  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
1668 }
1669 
1670 // ------------------------------ ShiftLeftBytes
1671 
1672 template <int kBytes, class V>
1674  const Repartition<uint8_t, decltype(d)> d8;
1675  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
1676 }
1677 
1678 template <int kBytes, class V>
1679 HWY_API V ShiftLeftBytes(const V v) {
1680  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
1681 }
1682 
1683 // ------------------------------ ShiftRightLanes
1684 template <size_t kLanes, typename T, size_t N, class V = VFromD<Simd<T, N>>>
1685 HWY_API V ShiftRightLanes(const Simd<T, N> d, V v) {
1686  const RebindToSigned<decltype(d)> di;
1687  // For partial vectors, clear upper lanes so we shift in zeros.
1688  if (N <= 16 / sizeof(T)) {
1689  v = IfThenElseZero(FirstN(d, N), v);
1690  }
1691 
1692  const auto shifted = detail::SlideDown(v, v, kLanes);
1693  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
1694  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
1695  const auto idx_mod = detail::AndS(detail::Iota0(di), kLanesPerBlock - 1);
1696  const auto keep = Lt(BitCast(di, idx_mod), Set(di, kLanesPerBlock - kLanes));
1697  return IfThenElseZero(keep, shifted);
1698 }
1699 
1700 // ------------------------------ ShiftRightBytes
1701 template <int kBytes, class D, class V = VFromD<D>>
1702 HWY_API V ShiftRightBytes(const D d, const V v) {
1703  const Repartition<uint8_t, decltype(d)> d8;
1704  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
1705 }
1706 
1707 // ------------------------------ InterleaveLower
1708 
1709 // TODO(janwas): PromoteTo(LowerHalf), slide1up, add
1710 template <class D, class V>
1711 HWY_API V InterleaveLower(D d, const V a, const V b) {
1712  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
1713  const RebindToUnsigned<decltype(d)> du;
1714  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
1715  const auto i = detail::Iota0(du);
1716  const auto idx_mod = ShiftRight<1>(detail::AndS(i, kLanesPerBlock - 1));
1717  const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
1718  const auto is_even = Eq(detail::AndS(i, 1), Zero(du));
1719  return IfThenElse(is_even, TableLookupLanes(a, idx),
1720  TableLookupLanes(b, idx));
1721 }
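// Per 128-bit block, output lane i gathers source lane i/2 of that block:
// even output lanes read from a and odd lanes from b, yielding the usual
// a0 b0 a1 b1 ... interleave.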
1722 
1723 template <class V>
1724 HWY_API V InterleaveLower(const V a, const V b) {
1725  return InterleaveLower(DFromV<V>(), a, b);
1726 }
1727 
1728 // ------------------------------ InterleaveUpper
1729 
1730 template <class D, class V>
1731 HWY_API V InterleaveUpper(const D d, const V a, const V b) {
1732  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
1733  const RebindToUnsigned<decltype(d)> du;
1734  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
1735  const auto i = detail::Iota0(du);
1736  const auto idx_mod = ShiftRight<1>(detail::AndS(i, kLanesPerBlock - 1));
1737  const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
1738  const auto idx = detail::AddS(idx_lower, kLanesPerBlock / 2);
1739  const auto is_even = Eq(detail::AndS(i, 1), Zero(du));
1740  return IfThenElse(is_even, TableLookupLanes(a, idx),
1741  TableLookupLanes(b, idx));
1742 }
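// Identical to InterleaveLower except the gathered index starts at the middle
// of each block (kLanesPerBlock / 2).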
1743 
1744 // ------------------------------ ZipLower
1745 
1746 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1747 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
1748  const RepartitionToNarrow<DW> dn;
1749  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
1750  return BitCast(dw, InterleaveLower(dn, a, b));
1751 }
1752 
1753 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1754 HWY_API VFromD<DW> ZipLower(const V a, const V b) {
1755  return BitCast(DW(), InterleaveLower(a, b));
1756 }
1757 
1758 // ------------------------------ ZipUpper
1759 template <class DW, class V>
1760 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
1761  const RepartitionToNarrow<DW> dn;
1762  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
1763  return BitCast(dw, InterleaveUpper(dn, a, b));
1764 }
1765 
1766 // ================================================== REDUCE
1767 
1768 // vector = f(vector, zero_m1)
1769 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
1770  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1771  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
1772  return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \
1773  GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
1774  v0, v, v0, HWY_RVV_AVL(SEW, SHIFT)))); \
1775  }
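// The _vs_ (vector-scalar) reduction intrinsics fold all lanes of v together
// with lane 0 of the m1 vector v0 and write the result to lane 0 of an m1
// destination; GetLane + Set then broadcasts that scalar to the full vector.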
1776 
1777 // ------------------------------ SumOfLanes
1778 
1779 namespace detail {
1780 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum)
1781 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum)
1782 } // namespace detail
1783 
1784 template <class D>
1785 HWY_API VFromD<D> SumOfLanes(D /* d */, const VFromD<D> v) {
1786  const auto v0 = Zero(Full<TFromD<D>>()); // always m1
1787  return detail::RedSum(v, v0);
1788 }
1789 
1790 // ------------------------------ MinOfLanes
1791 namespace detail {
1792 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu)
1793 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin)
1794 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin)
1795 } // namespace detail
1796 
1797 template <class D>
1798 HWY_API VFromD<D> MinOfLanes(D /* d */, const VFromD<D> v) {
1799  using T = TFromD<D>;
1800  const Full<T> d1; // always m1
1801  const auto neutral = Set(d1, HighestValue<T>());
1802  return detail::RedMin(v, neutral);
1803 }
1804 
1805 // ------------------------------ MaxOfLanes
1806 namespace detail {
1807 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu)
1808 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax)
1809 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax)
1810 } // namespace detail
1811 
1812 template <class D>
1813 HWY_API VFromD<D> MaxOfLanes(D /* d */, const VFromD<D> v) {
1814  using T = TFromD<D>;
1815  const Full<T> d1; // always m1
1816  const auto neutral = Set(d1, LowestValue<T>());
1817  return detail::RedMax(v, neutral);
1818 }
1819 
1820 #undef HWY_RVV_REDUCE
1821 
1822 // ================================================== Ops with dependencies
1823 
1824 // ------------------------------ LoadDup128
1825 
1826 template <class D>
1827 HWY_API VFromD<D> LoadDup128(const D d, const TFromD<D>* const HWY_RESTRICT p) {
1828  const auto loaded = Load(d, p);
1829  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
1830  // Broadcast the first block
1831  const auto idx = detail::AndS(detail::Iota0(d), kLanesPerBlock - 1);
1832  return TableLookupLanes(loaded, idx);
1833 }
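// A full vector is loaded, then AndS with (kLanesPerBlock - 1) folds every
// lane index back into [0, kLanesPerBlock), so the gather replicates the
// first 128-bit block across the whole vector.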
1834 
1835 // ------------------------------ StoreMaskBits
1836 #define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
1837  /* DEPRECATED */ \
1838  HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* bits) { \
1839  /* LMUL=1 is always enough */ \
1840  Full<uint8_t> d8; \
1841  const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \
1842  /* TODO(janwas): how to convert vbool* to vuint?*/ \
1843  /*Store(m, d8, bits);*/ \
1844  (void)m; \
1845  (void)bits; \
1846  return num_bytes; \
1847  } \
1848  template <class D> \
1849  HWY_API size_t StoreMaskBits(D /* tag */, HWY_RVV_M(MLEN) m, \
1850  uint8_t* bits) { \
1851  return StoreMaskBits(m, bits); \
1852  }
1853 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _)
1854 #undef HWY_RVV_STORE_MASK_BITS
1855 
1856 // ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
1857 
1858 // Disallow for 8-bit because Iota is likely to overflow.
1859 template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
1860 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
1861  const RebindToSigned<D> di;
1862  return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n)));
1863 }
1864 
1865 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
1866 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
1867  const auto zero = Zero(d);
1868  const auto one = Set(d, 1);
1869  return Eq(detail::SlideUp(one, zero, n), one);
1870 }
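// vslideup leaves destination lanes below the slide amount unchanged, so
// sliding `zero` up by n over a vector of ones keeps 1 in lanes [0, n) and
// writes 0 elsewhere; comparing with 1 recovers the mask without any Iota.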
1871 
1872 // ------------------------------ Neg (Sub)
1873 
1874 template <class V, HWY_IF_SIGNED_V(V)>
1875 HWY_API V Neg(const V v) {
1876  return Sub(Zero(DFromV<V>()), v);
1877 }
1878 
1879 // vector = f(vector), but argument is repeated
1880 #define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
1881  OP) \
1882  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1883  return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \
1884  }
1885 
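// vfsgnjn.vv v, v, v copies v with its sign negated, i.e. flips the sign bit,
// giving floating-point Neg in a single instruction: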
1886 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn)
1887 
1888 // ------------------------------ Abs (Max, Neg)
1889 
1890 template <class V, HWY_IF_SIGNED_V(V)>
1891 HWY_API V Abs(const V v) {
1892  return Max(v, Neg(v));
1893 }
1894 
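// Similarly, vfsgnjx.vv v, v, v XORs the sign of v with itself, clearing the
// sign bit and giving floating-point Abs in a single instruction: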
1895 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx)
1896 
1897 #undef HWY_RVV_RETV_ARGV2
1898 
1899 // ------------------------------ AbsDiff (Abs, Sub)
1900 template <class V>
1901 HWY_API V AbsDiff(const V a, const V b) {
1902  return Abs(Sub(a, b));
1903 }
1904 
1905 // ------------------------------ Round (NearestInt, ConvertTo, CopySign)
1906 
1907 // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
1908 // a dedicated instruction for that. Rounding to integer and converting back to
1909 // float is correct except when the input magnitude is large, in which case the
1910 // input was already an integer (because mantissa >> exponent is zero).
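// Worked example for f32, assuming MantissaEnd<float>() is 2^23 (8388608.0f):
// ulp(2^23) is exactly 1, so any |v| >= 2^23 is already integral and is
// returned unchanged; only |v| < 2^23 takes the int round-trip, which
// therefore cannot overflow the value bits of the integer type.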
1911 
1912 namespace detail {
1913 enum RoundingModes { kNear, kTrunc, kDown, kUp };
1914 
1915 template <class V>
1916 HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
1917  return Lt(Abs(v), Set(DFromV<V>(), MantissaEnd<TFromV<V>>()));
1918 }
1919 
1920 } // namespace detail
1921 
1922 template <class V>
1923 HWY_API V Round(const V v) {
1924  const DFromV<V> df;
1925 
1926  const auto integer = NearestInt(v); // round using current mode
1927  const auto int_f = ConvertTo(df, integer);
1928 
1929  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
1930 }
1931 
1932 // ------------------------------ Trunc (ConvertTo)
1933 template <class V>
1934 HWY_API V Trunc(const V v) {
1935  const DFromV<V> df;
1936  const RebindToSigned<decltype(df)> di;
1937 
1938  const auto integer = ConvertTo(di, v); // round toward 0
1939  const auto int_f = ConvertTo(df, integer);
1940 
1941  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
1942 }
1943 
1944 // ------------------------------ Ceil
1945 template <class V>
1946 HWY_API V Ceil(const V v) {
1947  asm volatile("fsrm %0" ::"r"(detail::kUp));
1948  const auto ret = Round(v);
1949  asm volatile("fsrm %0" ::"r"(detail::kNear));
1950  return ret;
1951 }
1952 
1953 // ------------------------------ Floor
1954 template <class V>
1955 HWY_API V Floor(const V v) {
1956  asm volatile("fsrm %0" ::"r"(detail::kDown));
1957  const auto ret = Round(v);
1958  asm volatile("fsrm %0" ::"r"(detail::kNear));
1959  return ret;
1960 }
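// Both Ceil and Floor assume the ambient dynamic rounding mode is
// round-to-nearest (kNear): fsrm switches the frm CSR for the duration of
// Round, then restores kNear unconditionally instead of saving the old value.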
1961 
1962 // ------------------------------ Iota (ConvertTo)
1963 
1964 template <class D, HWY_IF_UNSIGNED_D(D)>
1965 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
1966  return Add(detail::Iota0(d), Set(d, first));
1967 }
1968 
1969 template <class D, HWY_IF_SIGNED_D(D)>
1970 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
1971  const RebindToUnsigned<D> du;
1972  return Add(BitCast(d, detail::Iota0(du)), Set(d, first));
1973 }
1974 
1975 template <class D, HWY_IF_FLOAT_D(D)>
1976 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
1977  const RebindToUnsigned<D> du;
1978  const RebindToSigned<D> di;
1979  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
1980 }
1981 
1982 // ------------------------------ MulEven/Odd (Mul, OddEven)
1983 
1984 namespace detail {
1985 // A dedicated slide-by-1 instruction is presumably faster than a general slide.
1986 #define HWY_RVV_SLIDE1(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, OP) \
1987  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1988  return v##OP##_vx_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
1989  }
1990 
1991 HWY_RVV_FOREACH_UI32(HWY_RVV_SLIDE1, Slide1Up, slide1up)
1992 HWY_RVV_FOREACH_U64(HWY_RVV_SLIDE1, Slide1Up, slide1up)
1993 HWY_RVV_FOREACH_UI32(HWY_RVV_SLIDE1, Slide1Down, slide1down)
1994 HWY_RVV_FOREACH_U64(HWY_RVV_SLIDE1, Slide1Down, slide1down)
1995 #undef HWY_RVV_SLIDE1
1996 } // namespace detail
1997 
1998 template <class V, HWY_IF_LANE_SIZE_V(V, 4)>
1999 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(const V a, const V b) {
2000  const auto lo = Mul(a, b);
2001  const auto hi = detail::MulHigh(a, b);
2002  const RepartitionToWide<DFromV<V>> dw;
2003  return BitCast(dw, OddEven(detail::Slide1Up(hi), lo));
2004 }
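// Lane layout: lo holds the low halves of all products and hi the high
// halves. Slide1Up moves each high half into the odd lane above its even
// product, so after OddEven every aligned lane pair is {lo, hi} of the
// even-lane product; viewed through dw, that is the full-width result.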
2005 
2006 // There is no 64x64 vwmul.
2007 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2008 HWY_INLINE V MulEven(const V a, const V b) {
2009  const auto lo = detail::Mul(a, b);
2010  const auto hi = detail::MulHigh(a, b);
2011  return OddEven(detail::Slide1Up(hi), lo);
2012 }
2013 
2014 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2015 HWY_INLINE V MulOdd(const V a, const V b) {
2016  const auto lo = detail::Mul(a, b);
2017  const auto hi = detail::MulHigh(a, b);
2018  return OddEven(hi, detail::Slide1Down(lo));
2019 }
2020 
2021 // ------------------------------ ReorderDemote2To (OddEven)
2022 
2023 template <size_t N, class DF = Simd<float, N / 2>>
2024 HWY_API VFromD<Simd<bfloat16_t, N>> ReorderDemote2To(Simd<bfloat16_t, N> dbf16,
2025  VFromD<DF> a, VFromD<DF> b) {
2026  const RebindToUnsigned<decltype(dbf16)> du16;
2027  const RebindToUnsigned<DF> du32;
2028  const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
2029  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2030 }
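// bfloat16 is the upper 16 bits of binary32, so demotion is a truncating
// 16-bit right shift. The OddEven merge stores bf16(b) in even u16 lanes and
// bf16(a) in odd lanes; the "Reorder" in the name signals that this
// interleaved order is what ReorderWidenMulAccumulate consumes.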
2031 
2032 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2033 
2034 template <size_t N, class DU16 = Simd<uint16_t, N * 2>>
2035 HWY_API auto ReorderWidenMulAccumulate(Simd<float, N> df32, VFromD<DU16> a,
2036  VFromD<DU16> b,
2037  const VFromD<decltype(df32)> sum0,
2038  VFromD<decltype(df32)>& sum1)
2039  -> VFromD<decltype(df32)> {
2040  const DU16 du16;
2041  const RebindToUnsigned<decltype(df32)> du32;
2042  using VU32 = VFromD<decltype(du32)>;
2043  const VFromD<DU16> zero = Zero(du16);
2044  const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
2045  const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
2046  const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
2047  const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
2048  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2049  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2050 }
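// Zipping zeros below each bf16 element places its bits in the upper half of
// a u32 lane, which is exactly the binary32 value that the bf16 represents
// (a free promotion). The two MulAdd calls then accumulate products of the
// lower and upper vector halves into sum0 and sum1 respectively.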
2051 
2052 // ================================================== END MACROS
2053 namespace detail { // for code folding
2054 #undef HWY_IF_FLOAT_V
2055 #undef HWY_IF_SIGNED_V
2056 #undef HWY_IF_UNSIGNED_V
2057 
2058 #undef HWY_RVV_FOREACH
2059 #undef HWY_RVV_FOREACH_08
2060 #undef HWY_RVV_FOREACH_16
2061 #undef HWY_RVV_FOREACH_32
2062 #undef HWY_RVV_FOREACH_64
2063 #undef HWY_RVV_FOREACH_B
2064 #undef HWY_RVV_FOREACH_F
2065 #undef HWY_RVV_FOREACH_F32
2066 #undef HWY_RVV_FOREACH_F64
2067 #undef HWY_RVV_FOREACH_I
2068 #undef HWY_RVV_FOREACH_I08
2069 #undef HWY_RVV_FOREACH_I16
2070 #undef HWY_RVV_FOREACH_I32
2071 #undef HWY_RVV_FOREACH_I64
2072 #undef HWY_RVV_FOREACH_U
2073 #undef HWY_RVV_FOREACH_U08
2074 #undef HWY_RVV_FOREACH_U16
2075 #undef HWY_RVV_FOREACH_U32
2076 #undef HWY_RVV_FOREACH_U64
2077 #undef HWY_RVV_FOREACH_UI
2078 #undef HWY_RVV_FOREACH_UI16
2079 #undef HWY_RVV_FOREACH_UI32
2080 #undef HWY_RVV_FOREACH_UI64
2081 
2082 #undef HWY_RVV_RETV_ARGV
2083 #undef HWY_RVV_RETV_ARGVS
2084 #undef HWY_RVV_RETV_ARGVV
2085 
2086 #undef HWY_RVV_T
2087 #undef HWY_RVV_D
2088 #undef HWY_RVV_V
2089 #undef HWY_RVV_M
2090 
2091 } // namespace detail
2092 // NOLINTNEXTLINE(google-readability-namespace-comments)
2093 } // namespace HWY_NAMESPACE
2094 } // namespace hwy