x86_128-inl.h
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
16 // operations when compiling for those targets.
17 // External include guard in highway.h - see comment there.
18 
19 #include <emmintrin.h>
20 #if HWY_TARGET == HWY_SSSE3
21 #include <tmmintrin.h> // SSSE3
22 #else
23 #include <smmintrin.h> // SSE4
24 #include <wmmintrin.h> // CLMUL
25 #endif
26 #include <stddef.h>
27 #include <stdint.h>
28 
29 #include "hwy/base.h"
30 #include "hwy/ops/shared-inl.h"
31 
32 // Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
33 // which would free up port5. However, inline assembly isn't supported on
34 // MSVC, results in incorrect output on GCC 8.3, and raises "invalid output size
35 // for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F), hence we
36 // disable it.
37 #ifndef HWY_LOADDUP_ASM
38 #define HWY_LOADDUP_ASM 0
39 #endif
40 
41 HWY_BEFORE_NAMESPACE();
42 namespace hwy {
43 namespace HWY_NAMESPACE {
44 
45 template <typename T>
46 using Full128 = Simd<T, 16 / sizeof(T)>;
47 
48 namespace detail {
49 
50 template <typename T>
51 struct Raw128 {
52  using type = __m128i;
53 };
54 template <>
55 struct Raw128<float> {
56  using type = __m128;
57 };
58 template <>
59 struct Raw128<double> {
60  using type = __m128d;
61 };
62 
63 } // namespace detail
64 
65 template <typename T, size_t N = 16 / sizeof(T)>
66 class Vec128 {
67  using Raw = typename detail::Raw128<T>::type;
68 
69  public:
70  // Compound assignment. Only usable if there is a corresponding non-member
71  // binary operator overload. For example, only f32 and f64 support division.
72  HWY_INLINE Vec128& operator*=(const Vec128 other) {
73  return *this = (*this * other);
74  }
75  HWY_INLINE Vec128& operator/=(const Vec128 other) {
76  return *this = (*this / other);
77  }
78  HWY_INLINE Vec128& operator+=(const Vec128 other) {
79  return *this = (*this + other);
80  }
81  HWY_INLINE Vec128& operator-=(const Vec128 other) {
82  return *this = (*this - other);
83  }
84  HWY_INLINE Vec128& operator&=(const Vec128 other) {
85  return *this = (*this & other);
86  }
87  HWY_INLINE Vec128& operator|=(const Vec128 other) {
88  return *this = (*this | other);
89  }
90  HWY_INLINE Vec128& operator^=(const Vec128 other) {
91  return *this = (*this ^ other);
92  }
93 
94  Raw raw;
95 };
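// Illustrative usage sketch (hypothetical helper, not part of the original
// header): Vec128 is a zero-cost wrapper over __m128i/__m128/__m128d, and the
// public `raw` member allows mixing wrapper code with raw intrinsics.
inline Vec128<int32_t, 4> WrapRawIntrinsic() {
  const __m128i raw = _mm_set1_epi32(7);  // plain SSE2 intrinsic
  return Vec128<int32_t, 4>{raw};         // brace-init the wrapper around it
}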
96 
97 // Forward-declare for use by DeduceD, see below.
98 template <typename T>
99 class Vec256;
100 template <typename T>
101 class Vec512;
102 
103 #if HWY_TARGET <= HWY_AVX3
104 
105 namespace detail {
106 
107 // Template arg: sizeof(lane type)
108 template <size_t size>
109 struct RawMask128 {};
110 template <>
111 struct RawMask128<1> {
112  using type = __mmask16;
113 };
114 template <>
115 struct RawMask128<2> {
116  using type = __mmask8;
117 };
118 template <>
119 struct RawMask128<4> {
120  using type = __mmask8;
121 };
122 template <>
123 struct RawMask128<8> {
124  using type = __mmask8;
125 };
126 
127 } // namespace detail
128 
129 template <typename T, size_t N>
130 struct Mask128 {
131  using Raw = typename detail::RawMask128<sizeof(T)>::type;
132 
133  static Mask128<T, N> FromBits(uint64_t mask_bits) {
134  return Mask128<T, N>{static_cast<Raw>(mask_bits)};
135  }
136 
137  Raw raw;
138 };
139 
140 #else // AVX2 or below
141 
142 // FF..FF or 0.
143 template <typename T, size_t N>
144 struct Mask128 {
145  typename detail::Raw128<T>::type raw;
146 };
147 
148 #endif // HWY_TARGET <= HWY_AVX3
149 
150 namespace detail {
151 
152 // Deduce Simd<T, N> from Vec*<T, N> (pointers because Vec256/512 may be
153 // incomplete types at this point; this is simpler than avoiding multiple
154 // definitions of DFromV via #if)
155 struct DeduceD {
156  template <typename T, size_t N>
157  Simd<T, N> operator()(const Vec128<T, N>*) const {
158  return Simd<T, N>();
159  }
160  template <typename T>
161  Simd<T, 32 / sizeof(T)> operator()(const Vec256<T>*) const {
162  return Simd<T, 32 / sizeof(T)>();
163  }
164  template <typename T>
165  Simd<T, 64 / sizeof(T)> operator()(const Vec512<T>*) const {
166  return Simd<T, 64 / sizeof(T)>();
167  }
168 };
169 
170 // Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
171 template <class V>
172 struct ExpandDFromV {
173  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
174 };
175 
176 } // namespace detail
177 
178 template <class V>
179 using DFromV = typename detail::ExpandDFromV<V>::type;
180 
181 template <class V>
182 using TFromV = TFromD<DFromV<V>>;
183 
184 // ------------------------------ BitCast
185 
186 namespace detail {
187 
188 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
189 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
190 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
191 
192 template <typename T, size_t N>
193 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
194  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
195 }
196 
197 // Cannot rely on function overloading because return types differ.
198 template <typename T>
199 struct BitCastFromInteger128 {
200  HWY_INLINE __m128i operator()(__m128i v) { return v; }
201 };
202 template <>
203 struct BitCastFromInteger128<float> {
204  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
205 };
206 template <>
207 struct BitCastFromInteger128<double> {
208  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
209 };
210 
211 template <typename T, size_t N>
212 HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N> /* tag */,
213  Vec128<uint8_t, N * sizeof(T)> v) {
214  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
215 }
216 
217 } // namespace detail
218 
219 template <typename T, size_t N, typename FromT>
220 HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
221  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
222  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
223 }
224 
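// Illustrative usage sketch (hypothetical helper, not part of the original
// header): BitCast reinterprets the same 128 bits as another lane type and
// compiles to at most a register cast.
inline Vec128<uint32_t, 4> F32BitsToU32(Vec128<float, 4> v) {
  const Simd<uint32_t, 4> du;  // descriptor (tag) for the destination type
  return BitCast(du, v);       // lane count and total size are preserved
}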
225 // ------------------------------ Zero
226 
227 // Returns an all-zero vector/part.
228 template <typename T, size_t N, HWY_IF_LE128(T, N)>
229 HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
230  return Vec128<T, N>{_mm_setzero_si128()};
231 }
232 template <size_t N, HWY_IF_LE128(float, N)>
233 HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
234  return Vec128<float, N>{_mm_setzero_ps()};
235 }
236 template <size_t N, HWY_IF_LE128(double, N)>
237 HWY_API Vec128<double, N> Zero(Simd<double, N> /* tag */) {
238  return Vec128<double, N>{_mm_setzero_pd()};
239 }
240 
241 template <class D>
242 using VFromD = decltype(Zero(D()));
243 
244 // ------------------------------ Set
245 
246 // Returns a vector/part with all lanes set to "t".
247 template <size_t N, HWY_IF_LE128(uint8_t, N)>
248 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
249  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
250 }
251 template <size_t N, HWY_IF_LE128(uint16_t, N)>
252 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
253  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
254 }
255 template <size_t N, HWY_IF_LE128(uint32_t, N)>
256 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
257  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
258 }
259 template <size_t N, HWY_IF_LE128(uint64_t, N)>
260 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
261  return Vec128<uint64_t, N>{
262  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
263 }
264 template <size_t N, HWY_IF_LE128(int8_t, N)>
265 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
266  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
267 }
268 template <size_t N, HWY_IF_LE128(int16_t, N)>
269 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
270  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
271 }
272 template <size_t N, HWY_IF_LE128(int32_t, N)>
273 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
274  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
275 }
276 template <size_t N, HWY_IF_LE128(int64_t, N)>
277 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
278  return Vec128<int64_t, N>{
279  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
280 }
281 template <size_t N, HWY_IF_LE128(float, N)>
282 HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
283  return Vec128<float, N>{_mm_set1_ps(t)};
284 }
285 template <size_t N, HWY_IF_LE128(double, N)>
286 HWY_API Vec128<double, N> Set(Simd<double, N> /* tag */, const double t) {
287  return Vec128<double, N>{_mm_set1_pd(t)};
288 }
289 
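// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the Simd<> tag selects the overload, so the same code serves full
// vectors and parts.
inline Vec128<float, 4> BroadcastPi() {
  const Simd<float, 4> d;      // 4 x float = full 128-bit vector
  return Set(d, 3.14159265f);  // every lane holds the same value
}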
290 HWY_DIAGNOSTICS(push)
291 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
292 
293 // Returns a vector with uninitialized elements.
294 template <typename T, size_t N, HWY_IF_LE128(T, N)>
295 HWY_API Vec128<T, N> Undefined(Simd<T, N> /* tag */) {
296  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
297  // generate an XOR instruction.
298  return Vec128<T, N>{_mm_undefined_si128()};
299 }
300 template <size_t N, HWY_IF_LE128(float, N)>
301 HWY_API Vec128<float, N> Undefined(Simd<float, N> /* tag */) {
302  return Vec128<float, N>{_mm_undefined_ps()};
303 }
304 template <size_t N, HWY_IF_LE128(double, N)>
305 HWY_API Vec128<double, N> Undefined(Simd<double, N> /* tag */) {
306  return Vec128<double, N>{_mm_undefined_pd()};
307 }
308 
309 HWY_DIAGNOSTICS(pop)
310 
311 // ------------------------------ GetLane
312 
313 // Gets the single value stored in a vector/part.
314 template <size_t N>
315 HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
316  return static_cast<uint8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
317 }
318 template <size_t N>
319 HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
320  return static_cast<int8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
321 }
322 template <size_t N>
323 HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
324  return static_cast<uint16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
325 }
326 template <size_t N>
327 HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
328  return static_cast<int16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
329 }
330 template <size_t N>
331 HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
332  return static_cast<uint32_t>(_mm_cvtsi128_si32(v.raw));
333 }
334 template <size_t N>
335 HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
336  return _mm_cvtsi128_si32(v.raw);
337 }
338 template <size_t N>
339 HWY_API float GetLane(const Vec128<float, N> v) {
340  return _mm_cvtss_f32(v.raw);
341 }
342 template <size_t N>
343 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
344 #if HWY_ARCH_X86_32
345  alignas(16) uint64_t lanes[2];
346  Store(v, Simd<uint64_t, N>(), lanes);
347  return lanes[0];
348 #else
349  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
350 #endif
351 }
352 template <size_t N>
353 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
354 #if HWY_ARCH_X86_32
355  alignas(16) int64_t lanes[2];
356  Store(v, Simd<int64_t, N>(), lanes);
357  return lanes[0];
358 #else
359  return _mm_cvtsi128_si64(v.raw);
360 #endif
361 }
362 template <size_t N>
363 HWY_API double GetLane(const Vec128<double, N> v) {
364  return _mm_cvtsd_f64(v.raw);
365 }
366 
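// Illustrative usage sketch (hypothetical helper, not part of the original
// header): GetLane returns lane 0, e.g. to bring the result of a reduction
// back into scalar code.
inline float LaneZero(Vec128<float, 4> v) {
  return GetLane(v);  // equivalent to _mm_cvtss_f32(v.raw)
}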
367 // ================================================== LOGICAL
368 
369 // ------------------------------ And
370 
371 template <typename T, size_t N>
372 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
373  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
374 }
375 template <size_t N>
376 HWY_API Vec128<float, N> And(const Vec128<float, N> a,
377  const Vec128<float, N> b) {
378  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
379 }
380 template <size_t N>
381 HWY_API Vec128<double, N> And(const Vec128<double, N> a,
382  const Vec128<double, N> b) {
383  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
384 }
385 
386 // ------------------------------ AndNot
387 
388 // Returns ~not_mask & mask.
389 template <typename T, size_t N>
390 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
391  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
392 }
393 template <size_t N>
394 HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
395  const Vec128<float, N> mask) {
396  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
397 }
398 template <size_t N>
399 HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
400  const Vec128<double, N> mask) {
401  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
402 }
403 
404 // ------------------------------ Or
405 
406 template <typename T, size_t N>
407 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
408  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
409 }
410 
411 template <size_t N>
412 HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
413  const Vec128<float, N> b) {
414  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
415 }
416 template <size_t N>
417 HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
418  const Vec128<double, N> b) {
419  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
420 }
421 
422 // ------------------------------ Xor
423 
424 template <typename T, size_t N>
425 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
426  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
427 }
428 
429 template <size_t N>
430 HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
431  const Vec128<float, N> b) {
432  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
433 }
434 template <size_t N>
435 HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
436  const Vec128<double, N> b) {
437  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
438 }
439 
440 // ------------------------------ Not
441 
442 template <typename T, size_t N>
443 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
444  using TU = MakeUnsigned<T>;
445 #if HWY_TARGET <= HWY_AVX3
446  const __m128i vu = BitCast(Simd<TU, N>(), v).raw;
447  return BitCast(Simd<T, N>(),
448  Vec128<TU, N>{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
449 #else
450  return Xor(v, BitCast(Simd<T, N>(), Vec128<TU, N>{_mm_set1_epi32(-1)}));
451 #endif
452 }
453 
454 // ------------------------------ Operator overloads (internal-only if float)
455 
456 template <typename T, size_t N>
457 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
458  return And(a, b);
459 }
460 
461 template <typename T, size_t N>
462 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
463  return Or(a, b);
464 }
465 
466 template <typename T, size_t N>
467 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
468  return Xor(a, b);
469 }
470 
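// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the bitwise ops also accept float vectors, which is how sign-bit
// tricks are written. This clears the sign bit of each lane.
inline Vec128<float, 4> ClearSignBits(Vec128<float, 4> v) {
  const Simd<float, 4> d;
  const Vec128<int32_t, 4> mask{_mm_set1_epi32(0x7FFFFFFF)};
  return And(v, BitCast(d, mask));  // same idea as Abs(float) below
}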
471 // ------------------------------ PopulationCount
472 
473 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
474 #if HWY_TARGET == HWY_AVX3_DL
475 
476 #ifdef HWY_NATIVE_POPCNT
477 #undef HWY_NATIVE_POPCNT
478 #else
479 #define HWY_NATIVE_POPCNT
480 #endif
481 
482 namespace detail {
483 
484 template <typename T, size_t N>
485 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
486  Vec128<T, N> v) {
487  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
488 }
489 template <typename T, size_t N>
490 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
491  Vec128<T, N> v) {
492  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
493 }
494 template <typename T, size_t N>
495 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
496  Vec128<T, N> v) {
497  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
498 }
499 template <typename T, size_t N>
500 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
501  Vec128<T, N> v) {
502  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
503 }
504 
505 } // namespace detail
506 
507 template <typename T, size_t N>
508 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
509  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
510 }
511 
512 #endif // HWY_TARGET == HWY_AVX3_DL
513 
514 // ================================================== SIGN
515 
516 // ------------------------------ Neg
517 
518 template <typename T, size_t N, HWY_IF_FLOAT(T)>
519 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
520  return Xor(v, SignBit(Simd<T, N>()));
521 }
522 
523 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
524 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
525  return Zero(Simd<T, N>()) - v;
526 }
527 
528 // ------------------------------ Abs
529 
530 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
531 template <size_t N>
532 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
533 #if HWY_COMPILER_MSVC
534  // Workaround for incorrect codegen? (reaches breakpoint)
535  const auto zero = Zero(Simd<int8_t, N>());
536  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
537 #else
538  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
539 #endif
540 }
541 template <size_t N>
542 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
543  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
544 }
545 template <size_t N>
546 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
547  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
548 }
549 // i64 is implemented after BroadcastSignBit.
550 template <size_t N>
551 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
552  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
553  return v & BitCast(Simd<float, N>(), mask);
554 }
555 template <size_t N>
556 HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
557  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
558  return v & BitCast(Simd<double, N>(), mask);
559 }
560 
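// Illustrative usage sketch (hypothetical helper, not part of the original
// header): per the caveat above, the most negative integer has no positive
// counterpart, so Abs maps it back to itself.
inline Vec128<int32_t, 4> AbsWrapsAtMin(Vec128<int32_t, 4> v) {
  return Abs(v);  // _mm_abs_epi32; Abs(INT32_MIN) == INT32_MIN
}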
561 // ------------------------------ CopySign
562 
563 template <typename T, size_t N>
564 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
565  const Vec128<T, N> sign) {
566  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
567 
568  const Simd<T, N> d;
569  const auto msb = SignBit(d);
570 
571 #if HWY_TARGET <= HWY_AVX3
572  const Rebind<MakeUnsigned<T>, decltype(d)> du;
573  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
574  // 0 0 0 | 0
575  // 0 0 1 | 0
576  // 0 1 0 | 1
577  // 0 1 1 | 1
578  // 1 0 0 | 0
579  // 1 0 1 | 1
580  // 1 1 0 | 0
581  // 1 1 1 | 1
582  // The lane size does not matter because we are not using predication.
583  const __m128i out = _mm_ternarylogic_epi32(
584  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
585  return BitCast(d, decltype(Zero(du)){out});
586 #else
587  return Or(AndNot(msb, magn), And(msb, sign));
588 #endif
589 }
590 
591 template <typename T, size_t N>
592 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
593  const Vec128<T, N> sign) {
594 #if HWY_TARGET <= HWY_AVX3
595  // AVX3 can also handle abs < 0, so no extra action needed.
596  return CopySign(abs, sign);
597 #else
598  return Or(abs, And(SignBit(Simd<T, N>()), sign));
599 #endif
600 }
601 
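// Illustrative usage sketch (hypothetical helper, not part of the original
// header): CopySign behaves like std::copysign, lane by lane.
inline Vec128<float, 4> CopySignPerLane(Vec128<float, 4> magn,
                                        Vec128<float, 4> sign) {
  return CopySign(magn, sign);  // magnitude of magn, sign bit of sign
}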
602 // ================================================== MASK
603 
604 #if HWY_TARGET <= HWY_AVX3
605 
606 // ------------------------------ FirstN
607 
608 template <typename T, size_t N, HWY_IF_LE128(T, N)>
609 HWY_API Mask128<T, N> FirstN(const Simd<T, N> /*tag*/, size_t n) {
610  return Mask128<T, N>::FromBits(_bzhi_u64(~uint64_t(0), n));
611 }
612 
613 template <class D>
614 using MFromD = decltype(FirstN(D(), 0));
615 
616 // ------------------------------ IfThenElse
617 
618 // Returns mask ? b : a.
619 
620 namespace detail {
621 
622 // Templates for signed/unsigned integer of a particular size.
623 template <typename T, size_t N>
624 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
625  Mask128<T, N> mask, Vec128<T, N> yes,
626  Vec128<T, N> no) {
627  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
628 }
629 template <typename T, size_t N>
630 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
631  Mask128<T, N> mask, Vec128<T, N> yes,
632  Vec128<T, N> no) {
633  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
634 }
635 template <typename T, size_t N>
636 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
637  Mask128<T, N> mask, Vec128<T, N> yes,
638  Vec128<T, N> no) {
639  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
640 }
641 template <typename T, size_t N>
642 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
643  Mask128<T, N> mask, Vec128<T, N> yes,
644  Vec128<T, N> no) {
645  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
646 }
647 
648 } // namespace detail
649 
650 template <typename T, size_t N>
651 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
652  Vec128<T, N> no) {
653  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
654 }
655 
656 template <size_t N>
657 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
658  Vec128<float, N> yes, Vec128<float, N> no) {
659  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
660 }
661 
662 template <size_t N>
663 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
664  Vec128<double, N> yes,
665  Vec128<double, N> no) {
666  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
667 }
668 
669 namespace detail {
670 
671 template <typename T, size_t N>
672 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
673  Mask128<T, N> mask, Vec128<T, N> yes) {
674  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
675 }
676 template <typename T, size_t N>
677 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
678  Mask128<T, N> mask, Vec128<T, N> yes) {
679  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
680 }
681 template <typename T, size_t N>
682 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
683  Mask128<T, N> mask, Vec128<T, N> yes) {
684  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
685 }
686 template <typename T, size_t N>
687 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
688  Mask128<T, N> mask, Vec128<T, N> yes) {
689  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
690 }
691 
692 } // namespace detail
693 
694 template <typename T, size_t N>
695 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
696  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
697 }
698 
699 template <size_t N>
700 HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
701  Vec128<float, N> yes) {
702  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
703 }
704 
705 template <size_t N>
706 HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
707  Vec128<double, N> yes) {
708  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
709 }
710 
711 namespace detail {
712 
713 template <typename T, size_t N>
714 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
715  Mask128<T, N> mask, Vec128<T, N> no) {
716  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
717  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
718 }
719 template <typename T, size_t N>
720 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
721  Mask128<T, N> mask, Vec128<T, N> no) {
722  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
723 }
724 template <typename T, size_t N>
725 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
726  Mask128<T, N> mask, Vec128<T, N> no) {
727  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
728 }
729 template <typename T, size_t N>
730 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
731  Mask128<T, N> mask, Vec128<T, N> no) {
732  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
733 }
734 
735 } // namespace detail
736 
737 template <typename T, size_t N>
738 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
739  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
740 }
741 
742 template <size_t N>
743 HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
744  Vec128<float, N> no) {
745  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
746 }
747 
748 template <size_t N>
749 HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
750  Vec128<double, N> no) {
751  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
752 }
753 
754 // ------------------------------ Mask logical
755 
756 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
757 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) && \
758  (HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
759  HWY_COMPILER_CLANG >= 800)
760 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
761 #else
762 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
763 #endif
764 
765 namespace detail {
766 
767 template <typename T, size_t N>
768 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
769  const Mask128<T, N> b) {
770 #if HWY_COMPILER_HAS_MASK_INTRINSICS
771  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
772 #else
773  return Mask128<T, N>{a.raw & b.raw};
774 #endif
775 }
776 template <typename T, size_t N>
777 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
778  const Mask128<T, N> b) {
779 #if HWY_COMPILER_HAS_MASK_INTRINSICS
780  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
781 #else
782  return Mask128<T, N>{a.raw & b.raw};
783 #endif
784 }
785 template <typename T, size_t N>
786 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
787  const Mask128<T, N> b) {
788 #if HWY_COMPILER_HAS_MASK_INTRINSICS
789  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
790 #else
791  return Mask128<T, N>{static_cast<uint16_t>(a.raw & b.raw)};
792 #endif
793 }
794 template <typename T, size_t N>
795 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
796  const Mask128<T, N> b) {
797 #if HWY_COMPILER_HAS_MASK_INTRINSICS
798  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
799 #else
800  return Mask128<T, N>{static_cast<uint8_t>(a.raw & b.raw)};
801 #endif
802 }
803 
804 template <typename T, size_t N>
805 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
806  const Mask128<T, N> b) {
807 #if HWY_COMPILER_HAS_MASK_INTRINSICS
808  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
809 #else
810  return Mask128<T, N>{~a.raw & b.raw};
811 #endif
812 }
813 template <typename T, size_t N>
814 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
815  const Mask128<T, N> b) {
816 #if HWY_COMPILER_HAS_MASK_INTRINSICS
817  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
818 #else
819  return Mask128<T, N>{~a.raw & b.raw};
820 #endif
821 }
822 template <typename T, size_t N>
823 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
824  const Mask128<T, N> b) {
825 #if HWY_COMPILER_HAS_MASK_INTRINSICS
826  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
827 #else
828  return Mask128<T, N>{static_cast<uint16_t>(~a.raw & b.raw)};
829 #endif
830 }
831 template <typename T, size_t N>
832 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
833  const Mask128<T, N> b) {
834 #if HWY_COMPILER_HAS_MASK_INTRINSICS
835  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
836 #else
837  return Mask128<T, N>{static_cast<uint8_t>(~a.raw & b.raw)};
838 #endif
839 }
840 
841 template <typename T, size_t N>
842 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
843  const Mask128<T, N> b) {
844 #if HWY_COMPILER_HAS_MASK_INTRINSICS
845  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
846 #else
847  return Mask128<T, N>{a.raw | b.raw};
848 #endif
849 }
850 template <typename T, size_t N>
851 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
852  const Mask128<T, N> b) {
853 #if HWY_COMPILER_HAS_MASK_INTRINSICS
854  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
855 #else
856  return Mask128<T, N>{a.raw | b.raw};
857 #endif
858 }
859 template <typename T, size_t N>
860 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
861  const Mask128<T, N> b) {
862 #if HWY_COMPILER_HAS_MASK_INTRINSICS
863  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
864 #else
865  return Mask128<T, N>{static_cast<uint16_t>(a.raw | b.raw)};
866 #endif
867 }
868 template <typename T, size_t N>
869 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
870  const Mask128<T, N> b) {
871 #if HWY_COMPILER_HAS_MASK_INTRINSICS
872  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
873 #else
874  return Mask128<T, N>{static_cast<uint8_t>(a.raw | b.raw)};
875 #endif
876 }
877 
878 template <typename T, size_t N>
879 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
880  const Mask128<T, N> b) {
881 #if HWY_COMPILER_HAS_MASK_INTRINSICS
882  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
883 #else
884  return Mask128<T, N>{a.raw ^ b.raw};
885 #endif
886 }
887 template <typename T, size_t N>
888 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
889  const Mask128<T, N> b) {
890 #if HWY_COMPILER_HAS_MASK_INTRINSICS
891  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
892 #else
893  return Mask128<T, N>{a.raw ^ b.raw};
894 #endif
895 }
896 template <typename T, size_t N>
897 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
898  const Mask128<T, N> b) {
899 #if HWY_COMPILER_HAS_MASK_INTRINSICS
900  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
901 #else
902  return Mask128<T, N>{static_cast<uint16_t>(a.raw ^ b.raw)};
903 #endif
904 }
905 template <typename T, size_t N>
906 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
907  const Mask128<T, N> b) {
908 #if HWY_COMPILER_HAS_MASK_INTRINSICS
909  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
910 #else
911  return Mask128<T, N>{static_cast<uint8_t>(a.raw ^ b.raw)};
912 #endif
913 }
914 
915 } // namespace detail
916 
917 template <typename T, size_t N>
918 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
919  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
920 }
921 
922 template <typename T, size_t N>
923 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
924  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
925 }
926 
927 template <typename T, size_t N>
928 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
929  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
930 }
931 
932 template <typename T, size_t N>
933 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
934  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
935 }
936 
937 template <typename T, size_t N>
938 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
939  // Flip only the valid bits.
940  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
941 }
942 
943 #else // AVX2 or below
944 
945 // ------------------------------ Mask
946 
947 // Mask and Vec are the same (true = FF..FF).
948 template <typename T, size_t N>
949 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
950  return Mask128<T, N>{v.raw};
951 }
952 
953 template <typename T, size_t N>
954 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
955  return Vec128<T, N>{v.raw};
956 }
957 
958 template <typename T, size_t N>
959 HWY_API Vec128<T, N> VecFromMask(const Simd<T, N> /* tag */,
960  const Mask128<T, N> v) {
961  return Vec128<T, N>{v.raw};
962 }
963 
964 #if HWY_TARGET == HWY_SSSE3
965 
966 // mask ? yes : no
967 template <typename T, size_t N>
968 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
969  Vec128<T, N> no) {
970  const auto vmask = VecFromMask(Simd<T, N>(), mask);
971  return Or(And(vmask, yes), AndNot(vmask, no));
972 }
973 
974 #else // HWY_TARGET == HWY_SSSE3
975 
976 // mask ? yes : no
977 template <typename T, size_t N>
978 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
979  Vec128<T, N> no) {
980  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
981 }
982 template <size_t N>
983 HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
984  const Vec128<float, N> yes,
985  const Vec128<float, N> no) {
986  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
987 }
988 template <size_t N>
989 HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
990  const Vec128<double, N> yes,
991  const Vec128<double, N> no) {
992  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
993 }
994 
995 #endif // HWY_TARGET == HWY_SSSE3
996 
997 // mask ? yes : 0
998 template <typename T, size_t N>
999 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1000  return yes & VecFromMask(Simd<T, N>(), mask);
1001 }
1002 
1003 // mask ? 0 : no
1004 template <typename T, size_t N>
1005 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1006  return AndNot(VecFromMask(Simd<T, N>(), mask), no);
1007 }
1008 
1009 // ------------------------------ Mask logical
1010 
1011 template <typename T, size_t N>
1012 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1013  return MaskFromVec(Not(VecFromMask(Simd<T, N>(), m)));
1014 }
1015 
1016 template <typename T, size_t N>
1017 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1018  const Simd<T, N> d;
1019  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1020 }
1021 
1022 template <typename T, size_t N>
1023 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1024  const Simd<T, N> d;
1025  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1026 }
1027 
1028 template <typename T, size_t N>
1029 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1030  const Simd<T, N> d;
1031  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1032 }
1033 
1034 template <typename T, size_t N>
1035 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1036  const Simd<T, N> d;
1037  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1038 }
1039 
1040 #endif // HWY_TARGET <= HWY_AVX3
1041 
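// Illustrative usage sketch (hypothetical helper, not part of the original
// header): whether the target represents masks as bit registers (AVX-512) or
// as all-ones/all-zeros vectors, the selection API is the same.
inline Vec128<float, 4> SelectPerLane(Mask128<float, 4> m, Vec128<float, 4> yes,
                                      Vec128<float, 4> no) {
  return IfThenElse(m, yes, no);  // per lane: m ? yes : no
}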
1042 // ================================================== SWIZZLE (1)
1043 
1044 // ------------------------------ Hard-coded shuffles
1045 
1046 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1047 // Shuffle0321 rotates one lane to the right (the previous least-significant
1048 // lane is now most-significant). These could also be implemented via
1049 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
1050 
1051 // Swap 32-bit halves in 64-bit halves.
1052 template <size_t N>
1053 HWY_API Vec128<uint32_t, N> Shuffle2301(const Vec128<uint32_t, N> v) {
1054  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1055  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1056 }
1057 template <size_t N>
1058 HWY_API Vec128<int32_t, N> Shuffle2301(const Vec128<int32_t, N> v) {
1059  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1060  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1061 }
1062 template <size_t N>
1063 HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
1064  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1065  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
1066 }
1067 
1068 // Swap 64-bit halves
1069 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1070  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1071 }
1072 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1073  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1074 }
1075 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1076  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1077 }
1078 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
1079  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1080 }
1081 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
1082  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1083 }
1084 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
1085  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
1086 }
1087 
1088 // Rotate right 32 bits
1089 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1090  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1091 }
1092 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1093  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1094 }
1095 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1096  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1097 }
1098 // Rotate left 32 bits
1099 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1100  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1101 }
1102 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1103  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1104 }
1105 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1106  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1107 }
1108 
1109 // Reverse
1110 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1111  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1112 }
1113 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1114  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1115 }
1116 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1117  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1118 }
1119 
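// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the digits in the name give the source lane for each destination
// lane (most- to least-significant), so Shuffle0123 reverses the four 32-bit
// lanes.
inline Vec128<int32_t> ReverseI32Lanes(Vec128<int32_t> v) {
  return Shuffle0123(v);  // lanes 3,2,1,0 become 0,1,2,3
}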
1120 // ================================================== COMPARE
1121 
1122 #if HWY_TARGET <= HWY_AVX3
1123 
1124 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1125 
1126 template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
1127 HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo> /*tag*/,
1128  Mask128<TFrom, NFrom> m) {
1129  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1130  return Mask128<TTo, NTo>{m.raw};
1131 }
1132 
1133 namespace detail {
1134 
1135 template <typename T, size_t N>
1136 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
1137  const Vec128<T, N> bit) {
1138  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
1139 }
1140 template <typename T, size_t N>
1141 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
1142  const Vec128<T, N> bit) {
1143  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
1144 }
1145 template <typename T, size_t N>
1146 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
1147  const Vec128<T, N> bit) {
1148  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
1149 }
1150 template <typename T, size_t N>
1151 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
1152  const Vec128<T, N> bit) {
1153  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
1154 }
1155 
1156 } // namespace detail
1157 
1158 template <typename T, size_t N>
1159 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
1160  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1161  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1162 }
1163 
1164 // ------------------------------ Equality
1165 
1166 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1167 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1168  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
1169 }
1170 
1171 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1172 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1173  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1174 }
1175 
1176 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1177 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1178  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1179 }
1180 
1181 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1182 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1183  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1184 }
1185 
1186 template <size_t N>
1187 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
1188  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1189 }
1190 
1191 template <size_t N>
1192 HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
1193  Vec128<double, N> b) {
1194  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1195 }
1196 
1197 // ------------------------------ Inequality
1198 
1199 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1200 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1201  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
1202 }
1203 
1204 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1205 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1206  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1207 }
1208 
1209 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1210 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1211  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1212 }
1213 
1214 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1215 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1216  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1217 }
1218 
1219 template <size_t N>
1220 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
1221  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1222 }
1223 
1224 template <size_t N>
1225 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
1226  Vec128<double, N> b) {
1227  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1228 }
1229 
1230 // ------------------------------ Strict inequality
1231 
1232 // Signed/float <
1233 template <size_t N>
1234 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1235  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1236 }
1237 template <size_t N>
1238 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1239  Vec128<int16_t, N> b) {
1240  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1241 }
1242 template <size_t N>
1243 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1244  Vec128<int32_t, N> b) {
1245  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1246 }
1247 template <size_t N>
1248 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
1249  Vec128<int64_t, N> b) {
1250  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1251 }
1252 template <size_t N>
1253 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1254  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1255 }
1256 template <size_t N>
1257 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1258  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1259 }
1260 
1261 // ------------------------------ Weak inequality
1262 
1263 template <size_t N>
1264 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
1265  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1266 }
1267 template <size_t N>
1268 HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
1269  Vec128<double, N> b) {
1270  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1271 }
1272 
1273 // ------------------------------ Mask
1274 
1275 namespace detail {
1276 
1277 template <typename T, size_t N>
1278 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
1279  const Vec128<T, N> v) {
1280  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
1281 }
1282 template <typename T, size_t N>
1283 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
1284  const Vec128<T, N> v) {
1285  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
1286 }
1287 template <typename T, size_t N>
1288 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
1289  const Vec128<T, N> v) {
1290  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
1291 }
1292 template <typename T, size_t N>
1293 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
1294  const Vec128<T, N> v) {
1295  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
1296 }
1297 
1298 } // namespace detail
1299 
1300 template <typename T, size_t N>
1301 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1302  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1303 }
1304 // There do not seem to be native floating-point versions of these instructions.
1305 template <size_t N>
1306 HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
1307  return Mask128<float, N>{_mm_movepi32_mask(_mm_castps_si128(v.raw))};
1308 }
1309 template <size_t N>
1310 HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
1311  return Mask128<double, N>{_mm_movepi64_mask(_mm_castpd_si128(v.raw))};
1312 }
1313 
1314 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1315 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1316  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1317 }
1318 
1319 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1320 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1321  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1322 }
1323 
1324 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1325 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1326  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1327 }
1328 
1329 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1330 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1331  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1332 }
1333 
1334 template <size_t N>
1335 HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
1336  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1337 }
1338 
1339 template <size_t N>
1340 HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
1341  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1342 }
1343 
1344 template <typename T, size_t N>
1345 HWY_API Vec128<T, N> VecFromMask(Simd<T, N> /* tag */, const Mask128<T, N> v) {
1346  return VecFromMask(v);
1347 }
1348 
1349 #else // AVX2 or below
1350 
1351 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1352 
1353 template <typename TFrom, typename TTo, size_t N>
1354 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
1355  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1356  const Simd<TFrom, N> d;
1357  return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
1358 }
1359 
1360 template <typename T, size_t N>
1361 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1362  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1363  return (v & bit) == bit;
1364 }
1365 
1366 // ------------------------------ Equality
1367 
1368 // Unsigned
1369 template <size_t N>
1370 HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
1371  const Vec128<uint8_t, N> b) {
1372  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1373 }
1374 template <size_t N>
1375 HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
1376  const Vec128<uint16_t, N> b) {
1377  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1378 }
1379 template <size_t N>
1380 HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
1381  const Vec128<uint32_t, N> b) {
1382  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1383 }
1384 template <size_t N>
1385 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1386  const Vec128<uint64_t, N> b) {
1387 #if HWY_TARGET == HWY_SSSE3
1388  const Simd<uint32_t, N * 2> d32;
1389  const Simd<uint64_t, N> d64;
1390  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1391  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1392  return MaskFromVec(BitCast(d64, cmp64));
1393 #else
1394  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1395 #endif
1396 }
1397 
1398 // Signed
1399 template <size_t N>
1400 HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
1401  const Vec128<int8_t, N> b) {
1402  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1403 }
1404 template <size_t N>
1405 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
1406  Vec128<int16_t, N> b) {
1407  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1408 }
1409 template <size_t N>
1410 HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
1411  const Vec128<int32_t, N> b) {
1412  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1413 }
1414 template <size_t N>
1415 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1416  const Vec128<int64_t, N> b) {
1417  // Same as signed ==; avoid duplicating the SSSE3 version.
1418  const Simd<uint64_t, N> du;
1419  return RebindMask(Simd<int64_t, N>(), BitCast(du, a) == BitCast(du, b));
1420 }
1421 
1422 // Float
1423 template <size_t N>
1424 HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
1425  const Vec128<float, N> b) {
1426  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1427 }
1428 template <size_t N>
1429 HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
1430  const Vec128<double, N> b) {
1431  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1432 }
1433 
1434 // ------------------------------ Inequality
1435 
1436 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1437 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1438  return Not(a == b);
1439 }
1440 
1441 template <size_t N>
1442 HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1443  const Vec128<float, N> b) {
1444  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1445 }
1446 template <size_t N>
1447 HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
1448  const Vec128<double, N> b) {
1449  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1450 }
1451 
1452 // ------------------------------ Strict inequality
1453 
1454 // Signed/float <
1455 template <size_t N>
1456 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1457  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1458 }
1459 template <size_t N>
1460 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1461  Vec128<int16_t, N> b) {
1462  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1463 }
1464 template <size_t N>
1465 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1466  Vec128<int32_t, N> b) {
1467  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1468 }
1469 template <size_t N>
1470 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1471  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1472 }
1473 template <size_t N>
1474 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1475  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1476 }
1477 
1478 template <size_t N>
1479 HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1480  const Vec128<int64_t, N> b) {
1481 #if HWY_TARGET == HWY_SSSE3
1482  // If the upper half is less than or greater, this is the answer.
1483  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
1484 
1485  // Otherwise, the lower half decides.
1486  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
1487  const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
1488  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
1489 
1490  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
1491  // Copy result in upper 32 bits to lower 32 bits.
1492  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
1493 #else
1494  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
1495 #endif
1496 }
1497 
1498 
1499 // ------------------------------ Weak inequality
1500 
1501 template <size_t N>
1502 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1503  const Vec128<float, N> b) {
1504  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1505 }
1506 template <size_t N>
1507 HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1508  const Vec128<double, N> b) {
1509  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1510 }
1511 
1512 // ------------------------------ FirstN (Iota, Lt)
1513 
1514 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1515 HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
1516  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1517  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1518 }
1519 
1520 template <class D>
1521 using MFromD = decltype(FirstN(D(), 0));
1522 
1523 #endif // HWY_TARGET <= HWY_AVX3
1524 
1525 // ------------------------------ Reversed comparisons
1526 
1527 template <typename T, size_t N>
1528 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1529  return b > a;
1530 }
1531 
1532 template <typename T, size_t N>
1533 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1534  return b >= a;
1535 }
1536 
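// Illustrative usage sketch (hypothetical helper, not part of the original
// header): comparisons yield a Mask128 that plugs straight into the IfThen*
// selectors, e.g. a per-lane clamp of negatives to zero.
inline Vec128<float, 4> ClampNegativesToZero(Vec128<float, 4> v) {
  const Simd<float, 4> d;
  return IfThenElseZero(v > Zero(d), v);  // lanes < 0 become 0
}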
1537 // ================================================== MEMORY (1)
1538 
1539 // Clang static analysis claims the memory immediately after a partial vector
1540 // store is uninitialized, and also flags the input to partial loads (at least
1541 // for loadl_pd) as "garbage". This is a false alarm because msan does not
1542 // raise errors. We work around this by using CopyBytes instead of intrinsics,
1543 // but only for the analyzer to avoid potentially bad code generation.
1544 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1545 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1546 #if defined(__clang_analyzer__) || \
1547  (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1548 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
1549 #else
1550 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
1551 #endif
1552 #endif // HWY_SAFE_PARTIAL_LOAD_STORE
1553 
1554 // ------------------------------ Load
1555 
1556 template <typename T>
1557 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1558  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1559 }
1560 HWY_API Vec128<float> Load(Full128<float> /* tag */,
1561  const float* HWY_RESTRICT aligned) {
1562  return Vec128<float>{_mm_load_ps(aligned)};
1563 }
1564 HWY_API Vec128<double> Load(Full128<double> /* tag */,
1565  const double* HWY_RESTRICT aligned) {
1566  return Vec128<double>{_mm_load_pd(aligned)};
1567 }
1568 
1569 template <typename T>
1570 HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
1571  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1572 }
1573 HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1574  const float* HWY_RESTRICT p) {
1575  return Vec128<float>{_mm_loadu_ps(p)};
1576 }
1577 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1578  const double* HWY_RESTRICT p) {
1579  return Vec128<double>{_mm_loadu_pd(p)};
1580 }
1581 
1582 template <typename T>
1583 HWY_API Vec128<T, 8 / sizeof(T)> Load(Simd<T, 8 / sizeof(T)> /* tag */,
1584  const T* HWY_RESTRICT p) {
1585 #if HWY_SAFE_PARTIAL_LOAD_STORE
1586  __m128i v = _mm_setzero_si128();
1587  CopyBytes<8>(p, &v);
1588  return Vec128<T, 8 / sizeof(T)>{v};
1589 #else
1590  return Vec128<T, 8 / sizeof(T)>{
1591  _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1592 #endif
1593 }
1594 
1595 HWY_API Vec128<float, 2> Load(Simd<float, 2> /* tag */,
1596  const float* HWY_RESTRICT p) {
1597 #if HWY_SAFE_PARTIAL_LOAD_STORE
1598  __m128 v = _mm_setzero_ps();
1599  CopyBytes<8>(p, &v);
1600  return Vec128<float, 2>{v};
1601 #else
1602  const __m128 hi = _mm_setzero_ps();
1603  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1604 #endif
1605 }
1606 
1607 HWY_API Vec128<double, 1> Load(Simd<double, 1> /* tag */,
1608  const double* HWY_RESTRICT p) {
1609 #if HWY_SAFE_PARTIAL_LOAD_STORE
1610  __m128d v = _mm_setzero_pd();
1611  CopyBytes<8>(p, &v);
1612  return Vec128<double, 1>{v};
1613 #else
1614  return Vec128<double, 1>{_mm_load_sd(p)};
1615 #endif
1616 }
1617 
1618 HWY_API Vec128<float, 1> Load(Simd<float, 1> /* tag */,
1619  const float* HWY_RESTRICT p) {
1620 #if HWY_SAFE_PARTIAL_LOAD_STORE
1621  __m128 v = _mm_setzero_ps();
1622  CopyBytes<4>(p, &v);
1623  return Vec128<float, 1>{v};
1624 #else
1625  return Vec128<float, 1>{_mm_load_ss(p)};
1626 #endif
1627 }
1628 
1629 // Any <= 32 bit except <float, 1>
1630 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1631 HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
1632  constexpr size_t kSize = sizeof(T) * N;
1633 #if HWY_SAFE_PARTIAL_LOAD_STORE
1634  __m128 v = _mm_setzero_ps();
1635  CopyBytes<kSize>(p, &v);
1636  return Vec128<T, N>{v};
1637 #else
1638  int32_t bits;
1639  CopyBytes<kSize>(p, &bits);
1640  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1641 #endif
1642 }
1643 
1644 // For < 128 bit, LoadU == Load.
1645 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1646 HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
1647  return Load(d, p);
1648 }
1649 
1650 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1651 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1652 HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) {
1653  return LoadU(d, p);
1654 }
1655 
1656 // ------------------------------ MaskedLoad
1657 
1658 #if HWY_TARGET <= HWY_AVX3
1659 
1660 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
1661 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1662  const T* HWY_RESTRICT aligned) {
1663  return Vec128<T, N>{_mm_maskz_load_epi32(m.raw, aligned)};
1664 }
1665 
1666 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
1667 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1668  const T* HWY_RESTRICT aligned) {
1669  return Vec128<T, N>{_mm_maskz_load_epi64(m.raw, aligned)};
1670 }
1671 
1672 template <size_t N, HWY_IF_LE128(float, N)>
1673 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
1674  Simd<float, N> /* tag */,
1675  const float* HWY_RESTRICT aligned) {
1676  return Vec128<float, N>{_mm_maskz_load_ps(m.raw, aligned)};
1677 }
1678 
1679 template <size_t N, HWY_IF_LE128(double, N)>
1680 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
1681  Simd<double, N> /* tag */,
1682  const double* HWY_RESTRICT aligned) {
1683  return Vec128<double, N>{_mm_maskz_load_pd(m.raw, aligned)};
1684 }
1685 
1686 // There is no load_epi8/16, so use loadu instead.
1687 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE128(T, N)>
1688 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1689  const T* HWY_RESTRICT aligned) {
1690  return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, aligned)};
1691 }
1692 
1693 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE128(T, N)>
1694 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1695  const T* HWY_RESTRICT aligned) {
1696  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, aligned)};
1697 }
1698 
1699 #else
1700 
1701 // Also applies to x86_256-inl.
1702 template <class M, class D>
1703 HWY_API VFromD<D> MaskedLoad(M m, D d, const TFromD<D>* HWY_RESTRICT aligned) {
1704  return IfThenElseZero(m, Load(d, aligned));
1705 }
1706 
1707 #endif
1708 
1709 // ------------------------------ Store
1710 
1711 template <typename T>
1712 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1713  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
1714 }
1715 HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
1716  float* HWY_RESTRICT aligned) {
1717  _mm_store_ps(aligned, v.raw);
1718 }
1719 HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
1720  double* HWY_RESTRICT aligned) {
1721  _mm_store_pd(aligned, v.raw);
1722 }
1723 
1724 template <typename T>
1725 HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
1726  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
1727 }
1728 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
1729  float* HWY_RESTRICT p) {
1730  _mm_storeu_ps(p, v.raw);
1731 }
1732 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
1733  double* HWY_RESTRICT p) {
1734  _mm_storeu_pd(p, v.raw);
1735 }
1736 
1737 template <typename T>
1738 HWY_API void Store(Vec128<T, 8 / sizeof(T)> v, Simd<T, 8 / sizeof(T)> /* tag */,
1739  T* HWY_RESTRICT p) {
1740 #if HWY_SAFE_PARTIAL_LOAD_STORE
1741  CopyBytes<8>(&v, p);
1742 #else
1743  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
1744 #endif
1745 }
1746 HWY_API void Store(const Vec128<float, 2> v, Simd<float, 2> /* tag */,
1747  float* HWY_RESTRICT p) {
1748 #if HWY_SAFE_PARTIAL_LOAD_STORE
1749  CopyBytes<8>(&v, p);
1750 #else
1751  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
1752 #endif
1753 }
1754 HWY_API void Store(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
1755  double* HWY_RESTRICT p) {
1756 #if HWY_SAFE_PARTIAL_LOAD_STORE
1757  CopyBytes<8>(&v, p);
1758 #else
1759  _mm_storel_pd(p, v.raw);
1760 #endif
1761 }
1762 
1763 // Any <= 32 bit except <float, 1>
1764 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1765 HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) {
1766  CopyBytes<sizeof(T) * N>(&v, p);
1767 }
1768 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
1769  float* HWY_RESTRICT p) {
1770 #if HWY_SAFE_PARTIAL_LOAD_STORE
1771  CopyBytes<4>(&v, p);
1772 #else
1773  _mm_store_ss(p, v.raw);
1774 #endif
1775 }
1776 
1777 // For < 128 bit, StoreU == Store.
1778 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1779 HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
1780  Store(v, d, p);
1781 }
1782 
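// Illustrative usage sketch (hypothetical helper, not part of the original
// header): Load/Store require 16-byte alignment for full vectors, while
// LoadU/StoreU accept any address.
inline void CopyFourFloats(const float* HWY_RESTRICT from,
                           float* HWY_RESTRICT to) {
  const Full128<float> d;
  StoreU(LoadU(d, from), d, to);  // single unaligned 128-bit copy
}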
1783 // ================================================== ARITHMETIC
1784 
1785 // ------------------------------ Addition
1786 
1787 // Unsigned
1788 template <size_t N>
1789 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
1790  const Vec128<uint8_t, N> b) {
1791  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1792 }
1793 template <size_t N>
1794 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
1795  const Vec128<uint16_t, N> b) {
1796  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1797 }
1798 template <size_t N>
1799 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
1800  const Vec128<uint32_t, N> b) {
1801  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1802 }
1803 template <size_t N>
1804 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
1805  const Vec128<uint64_t, N> b) {
1806  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
1807 }
1808 
1809 // Signed
1810 template <size_t N>
1811 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
1812  const Vec128<int8_t, N> b) {
1813  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1814 }
1815 template <size_t N>
1816 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
1817  const Vec128<int16_t, N> b) {
1818  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1819 }
1820 template <size_t N>
1821 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
1822  const Vec128<int32_t, N> b) {
1823  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1824 }
1825 template <size_t N>
1826 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
1827  const Vec128<int64_t, N> b) {
1828  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
1829 }
1830 
1831 // Float
1832 template <size_t N>
1833 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
1834  const Vec128<float, N> b) {
1835  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
1836 }
1837 template <size_t N>
1838 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
1839  const Vec128<double, N> b) {
1840  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
1841 }
1842 
1843 // ------------------------------ Subtraction
1844 
1845 // Unsigned
1846 template <size_t N>
1847 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
1848  const Vec128<uint8_t, N> b) {
1849  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1850 }
1851 template <size_t N>
1852 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
1853  Vec128<uint16_t, N> b) {
1854  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1855 }
1856 template <size_t N>
1857 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
1858  const Vec128<uint32_t, N> b) {
1859  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1860 }
1861 template <size_t N>
1862 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
1863  const Vec128<uint64_t, N> b) {
1864  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
1865 }
1866 
1867 // Signed
1868 template <size_t N>
1869 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
1870  const Vec128<int8_t, N> b) {
1871  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1872 }
1873 template <size_t N>
1874 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
1875  const Vec128<int16_t, N> b) {
1876  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1877 }
1878 template <size_t N>
1879 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
1880  const Vec128<int32_t, N> b) {
1881  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1882 }
1883 template <size_t N>
1884 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
1885  const Vec128<int64_t, N> b) {
1886  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
1887 }
1888 
1889 // Float
1890 template <size_t N>
1891 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
1892  const Vec128<float, N> b) {
1893  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
1894 }
1895 template <size_t N>
1896 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
1897  const Vec128<double, N> b) {
1898  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
1899 }
1900 
1901 // ------------------------------ Saturating addition
1902 
1903 // Returns a + b clamped to the destination range.
1904 
1905 // Unsigned
1906 template <size_t N>
1907 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
1908  const Vec128<uint8_t, N> b) {
1909  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
1910 }
1911 template <size_t N>
1912 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
1913  const Vec128<uint16_t, N> b) {
1914  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
1915 }
1916 
1917 // Signed
1918 template <size_t N>
1919 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
1920  const Vec128<int8_t, N> b) {
1921  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
1922 }
1923 template <size_t N>
1924 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
1925  const Vec128<int16_t, N> b) {
1926  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
1927 }
1928 
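// Illustrative sketch, not part of this header: saturation clamps instead of
// wrapping, so for uint8_t lanes 200 + 100 yields 255 rather than 44.
// Hypothetical helper, assumed to live inside HWY_NAMESPACE.
inline Vec128<uint8_t> SaturatedAddExample() {
  const Full128<uint8_t> d;
  return SaturatedAdd(Set(d, 200), Set(d, 100));  // every lane is 255
}
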
1929 // ------------------------------ Saturating subtraction
1930 
1931 // Returns a - b clamped to the destination range.
1932 
1933 // Unsigned
1934 template <size_t N>
1935 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
1936  const Vec128<uint8_t, N> b) {
1937  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
1938 }
1939 template <size_t N>
1940 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
1941  const Vec128<uint16_t, N> b) {
1942  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
1943 }
1944 
1945 // Signed
1946 template <size_t N>
1947 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
1948  const Vec128<int8_t, N> b) {
1949  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
1950 }
1951 template <size_t N>
1952 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
1953  const Vec128<int16_t, N> b) {
1954  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
1955 }
1956 
1957 // ------------------------------ AverageRound
1958 
1959 // Returns (a + b + 1) / 2
1960 
1961 // Unsigned
1962 template <size_t N>
1963 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
1964  const Vec128<uint8_t, N> b) {
1965  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
1966 }
1967 template <size_t N>
1968 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
1969  const Vec128<uint16_t, N> b) {
1970  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
1971 }
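// Illustrative sketch, not part of this header: the +1 rounds ties upward,
// e.g. (5 + 6 + 1) / 2 = 6 for uint8_t lanes. Hypothetical helper inside
// HWY_NAMESPACE.
inline Vec128<uint8_t> AverageRoundExample() {
  const Full128<uint8_t> d;
  return AverageRound(Set(d, 5), Set(d, 6));  // every lane is 6
}
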
1972 
1973 // ------------------------------ Integer multiplication
1974 
1975 template <size_t N>
1976 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
1977  const Vec128<uint16_t, N> b) {
1978  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
1979 }
1980 template <size_t N>
1981 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
1982  const Vec128<int16_t, N> b) {
1983  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
1984 }
1985 
1986 // Returns the upper 16 bits of a * b in each lane.
1987 template <size_t N>
1988 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
1989  const Vec128<uint16_t, N> b) {
1990  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
1991 }
1992 template <size_t N>
1993 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
1994  const Vec128<int16_t, N> b) {
1995  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
1996 }
1997 
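// Illustrative sketch, not part of this header: MulHigh keeps bits [31:16] of
// the full 32-bit product, e.g. 0x4000 * 0x0200 = 0x00800000, so each lane
// holds 0x0080. Hypothetical helper inside HWY_NAMESPACE.
inline Vec128<uint16_t> MulHighExample() {
  const Full128<uint16_t> d;
  return MulHigh(Set(d, 0x4000), Set(d, 0x0200));  // every lane is 0x0080
}
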
1998 // Multiplies even lanes (0, 2, ..); the lower half of each double-wide result
1999 // goes into the even lane and the upper half into its odd neighbor lane.
2000 template <size_t N>
2001 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2002  const Vec128<uint32_t, N> b) {
2003  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2004 }
2005 
2006 #if HWY_TARGET == HWY_SSSE3
2007 
2008 template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2
2009 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2010  const Vec128<int32_t, N> b) {
2011  return Set(Simd<int64_t, (N + 1) / 2>(), int64_t(GetLane(a)) * GetLane(b));
2012 }
2013 HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2014  const Vec128<int32_t> b) {
2015  alignas(16) int32_t a_lanes[4];
2016  alignas(16) int32_t b_lanes[4];
2017  const Full128<int32_t> di32;
2018  Store(a, di32, a_lanes);
2019  Store(b, di32, b_lanes);
2020  alignas(16) int64_t mul[2];
2021  mul[0] = int64_t(a_lanes[0]) * b_lanes[0];
2022  mul[1] = int64_t(a_lanes[2]) * b_lanes[2];
2023  return Load(Full128<int64_t>(), mul);
2024 }
2025 
2026 #else // HWY_TARGET == HWY_SSSE3
2027 
2028 template <size_t N>
2029 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2030  const Vec128<int32_t, N> b) {
2031  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2032 }
2033 
2034 #endif // HWY_TARGET == HWY_SSSE3
2035 
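// Illustrative sketch, not part of this header: lanes 0 and 2 are multiplied
// into 64-bit results occupying lane pairs {0,1} and {2,3} of the output, so
// products that overflow 32 bits are preserved. Hypothetical helper inside
// HWY_NAMESPACE.
inline Vec128<uint64_t> MulEvenExample() {
  const Full128<uint32_t> d;
  alignas(16) const uint32_t a[4] = {100000, 1, 7, 2};
  alignas(16) const uint32_t b[4] = {100000, 3, 9, 4};
  // Returns {100000ull * 100000, 7ull * 9}; odd input lanes 1 and 3 are ignored.
  return MulEven(Load(d, a), Load(d, b));
}
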
2036 template <size_t N>
2037 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2038  const Vec128<uint32_t, N> b) {
2039 #if HWY_TARGET == HWY_SSSE3
2040  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2041  // 64-bit right shift would also work but also needs port 5, so no benefit.
2042  // Notation: x=don't care, z=0.
2043  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2044  const auto mullo_x2x0 = MulEven(a, b);
2045  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2046  const auto mullo_x3x1 =
2047  MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2048  // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2049  // the latter requires one more instruction or a constant.
2050  const __m128i mul_20 =
2051  _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2052  const __m128i mul_31 =
2053  _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2054  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2055 #else
2056  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2057 #endif
2058 }
2059 
2060 template <size_t N>
2061 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2062  const Vec128<int32_t, N> b) {
2063  // Same as unsigned; avoid duplicating the SSSE3 code.
2064  const Simd<uint32_t, N> du;
2065  return BitCast(Simd<int32_t, N>(), BitCast(du, a) * BitCast(du, b));
2066 }
2067 
2068 // ------------------------------ ShiftLeft
2069 
2070 template <int kBits, size_t N>
2071 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
2072  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2073 }
2074 
2075 template <int kBits, size_t N>
2076 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
2077  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2078 }
2079 
2080 template <int kBits, size_t N>
2081 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
2082  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2083 }
2084 
2085 template <int kBits, size_t N>
2086 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
2087  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2088 }
2089 template <int kBits, size_t N>
2090 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
2091  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2092 }
2093 template <int kBits, size_t N>
2094 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
2095  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2096 }
2097 
2098 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2099 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
2100  const Simd<T, N> d8;
2101  // Use raw instead of BitCast to support N=1.
2102  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
2103  return kBits == 1
2104  ? (v + v)
2105  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
2106 }
2107 
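// Illustrative sketch, not part of this header: the mask above is needed
// because the 16-bit shift lets bits cross into the neighboring byte; the AND
// clears them again, e.g. ShiftLeft<3> of 0xFF yields 0xF8 in every lane.
// Hypothetical helper inside HWY_NAMESPACE.
inline Vec128<uint8_t> ShiftLeftU8Example() {
  const Full128<uint8_t> d;
  return ShiftLeft<3>(Set(d, 0xFF));  // every lane is 0xF8
}
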
2108 // ------------------------------ ShiftRight
2109 
2110 template <int kBits, size_t N>
2111 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
2112  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
2113 }
2114 template <int kBits, size_t N>
2115 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
2116  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
2117 }
2118 template <int kBits, size_t N>
2119 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
2120  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
2121 }
2122 
2123 template <int kBits, size_t N>
2124 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
2125  const Simd<uint8_t, N> d8;
2126  // Use raw instead of BitCast to support N=1.
2127  const Vec128<uint8_t, N> shifted{
2128  ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
2129  return shifted & Set(d8, 0xFF >> kBits);
2130 }
2131 
2132 template <int kBits, size_t N>
2133 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
2134  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
2135 }
2136 template <int kBits, size_t N>
2137 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
2138  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
2139 }
2140 
2141 template <int kBits, size_t N>
2142 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
2143  const Simd<int8_t, N> di;
2144  const Simd<uint8_t, N> du;
2145  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2146  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
2147  return (shifted ^ shifted_sign) - shifted_sign;
2148 }
2149 
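// Illustrative sketch, not part of this header, of the sign-extension trick
// above: for -128 >> 2, the unsigned shift gives 0x20, XOR with 0x80 >> 2
// (= 0x20) clears that bit, and subtracting 0x20 borrows through the upper
// bits, producing 0xE0 = -32. Hypothetical helper inside HWY_NAMESPACE.
inline Vec128<int8_t> ShiftRightI8Example() {
  const Full128<int8_t> d;
  return ShiftRight<2>(Set(d, int8_t{-128}));  // every lane is -32
}
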
2150 // i64 is implemented after BroadcastSignBit.
2151 
2152 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2153 
2154 template <size_t N>
2155 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2156  return VecFromMask(v < Zero(Simd<int8_t, N>()));
2157 }
2158 
2159 template <size_t N>
2160 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
2161  return ShiftRight<15>(v);
2162 }
2163 
2164 template <size_t N>
2165 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
2166  return ShiftRight<31>(v);
2167 }
2168 
2169 template <size_t N>
2170 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
2171 #if HWY_TARGET <= HWY_AVX3
2172  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2173 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2174  return VecFromMask(v < Zero(Simd<int64_t, N>()));
2175 #else
2176  // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2177  // avoids generating a zero.
2178  const Simd<int32_t, N * 2> d32;
2179  const auto sign = ShiftRight<31>(BitCast(d32, v));
2180  return Vec128<int64_t, N>{
2181  _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2182 #endif
2183 }
2184 
2185 template <size_t N>
2186 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2187 #if HWY_TARGET <= HWY_AVX3
2188  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2189 #else
2190  const auto zero = Zero(Simd<int64_t, N>());
2191  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2192 #endif
2193 }
2194 
2195 template <int kBits, size_t N>
2196 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2197 #if HWY_TARGET <= HWY_AVX3
2198  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2199 #else
2200  const Simd<int64_t, N> di;
2201  const Simd<uint64_t, N> du;
2202  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2203  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2204  return right | sign;
2205 #endif
2206 }
2207 
2208 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
2209 template <typename T, size_t N, HWY_IF_FLOAT(T)>
2210 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2211  const Simd<T, N> d;
2212 #if HWY_TARGET == HWY_SSSE3
2213  const RebindToSigned<decltype(d)> di;
2214  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2215 #else
2216  const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
2217 #endif
2218  return IfThenElse(mask, Zero(d), v);
2219 }
2220 
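// Illustrative sketch, not part of this header: lanes with the sign bit set
// become zero, all other lanes pass through unchanged. Hypothetical helper
// inside HWY_NAMESPACE.
inline Vec128<float> ZeroIfNegativeExample() {
  const Full128<float> d;
  alignas(16) const float in[4] = {-1.0f, 2.0f, -0.5f, 3.0f};
  return ZeroIfNegative(Load(d, in));  // {0, 2, 0, 3}
}
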
2221 // ------------------------------ ShiftLeftSame
2222 
2223 template <size_t N>
2224 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2225  const int bits) {
2226  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2227 }
2228 template <size_t N>
2229 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2230  const int bits) {
2231  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2232 }
2233 template <size_t N>
2234 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2235  const int bits) {
2236  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2237 }
2238 
2239 template <size_t N>
2240 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2241  const int bits) {
2242  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2243 }
2244 
2245 template <size_t N>
2246 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2247  const int bits) {
2248  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2249 }
2250 
2251 template <size_t N>
2252 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2253  const int bits) {
2254  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2255 }
2256 
2257 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2258 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2259  const Simd<T, N> d8;
2260  // Use raw instead of BitCast to support N=1.
2261  const Vec128<T, N> shifted{
2262  ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2263  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2264 }
2265 
2266 // ------------------------------ ShiftRightSame (BroadcastSignBit)
2267 
2268 template <size_t N>
2269 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2270  const int bits) {
2271  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2272 }
2273 template <size_t N>
2274 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2275  const int bits) {
2276  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2277 }
2278 template <size_t N>
2279 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2280  const int bits) {
2281  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2282 }
2283 
2284 template <size_t N>
2285 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2286  const int bits) {
2287  const Simd<uint8_t, N> d8;
2288  // Use raw instead of BitCast to support N=1.
2289  const Vec128<uint8_t, N> shifted{
2290  ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2291  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2292 }
2293 
2294 template <size_t N>
2295 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2296  const int bits) {
2297  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2298 }
2299 
2300 template <size_t N>
2301 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2302  const int bits) {
2303  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2304 }
2305 template <size_t N>
2306 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2307  const int bits) {
2308 #if HWY_TARGET <= HWY_AVX3
2309  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2310 #else
2311  const Simd<int64_t, N> di;
2312  const Simd<uint64_t, N> du;
2313  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2314  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2315  return right | sign;
2316 #endif
2317 }
2318 
2319 template <size_t N>
2320 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2321  const Simd<int8_t, N> di;
2322  const Simd<uint8_t, N> du;
2323  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2324  const auto shifted_sign =
2325  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2326  return (shifted ^ shifted_sign) - shifted_sign;
2327 }
2328 
2329 // ------------------------------ Floating-point mul / div
2330 
2331 template <size_t N>
2332 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2333  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2334 }
2335 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
2336  const Vec128<float, 1> b) {
2337  return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2338 }
2339 template <size_t N>
2340 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
2341  const Vec128<double, N> b) {
2342  return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2343 }
2344 HWY_API Vec128<double, 1> operator*(const Vec128<double, 1> a,
2345  const Vec128<double, 1> b) {
2346  return Vec128<double, 1>{_mm_mul_sd(a.raw, b.raw)};
2347 }
2348 
2349 template <size_t N>
2350 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2351  const Vec128<float, N> b) {
2352  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2353 }
2354 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
2355  const Vec128<float, 1> b) {
2356  return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2357 }
2358 template <size_t N>
2359 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
2360  const Vec128<double, N> b) {
2361  return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2362 }
2363 HWY_API Vec128<double, 1> operator/(const Vec128<double, 1> a,
2364  const Vec128<double, 1> b) {
2365  return Vec128<double, 1>{_mm_div_sd(a.raw, b.raw)};
2366 }
2367 
2368 // Approximate reciprocal
2369 template <size_t N>
2370 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2371  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2372 }
2373 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
2374  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2375 }
2376 
2377 // Absolute value of difference.
2378 template <size_t N>
2379 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2380  const Vec128<float, N> b) {
2381  return Abs(a - b);
2382 }
2383 
2384 // ------------------------------ Floating-point multiply-add variants
2385 
2386 // Returns mul * x + add
2387 template <size_t N>
2388 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2389  const Vec128<float, N> x,
2390  const Vec128<float, N> add) {
2391 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2392  return mul * x + add;
2393 #else
2394  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2395 #endif
2396 }
2397 template <size_t N>
2398 HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
2399  const Vec128<double, N> x,
2400  const Vec128<double, N> add) {
2401 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2402  return mul * x + add;
2403 #else
2404  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2405 #endif
2406 }
2407 
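// Illustrative note, not part of this header: on SSSE3/SSE4 MulAdd is an
// unfused multiply followed by an add (two roundings); on FMA-capable targets
// it maps to a single fused instruction. Hypothetical helper inside
// HWY_NAMESPACE.
inline Vec128<float> MulAddExample(Vec128<float> m, Vec128<float> x,
                                   Vec128<float> a) {
  return MulAdd(m, x, a);  // m * x + a per lane
}
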
2408 // Returns add - mul * x
2409 template <size_t N>
2410 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2411  const Vec128<float, N> x,
2412  const Vec128<float, N> add) {
2413 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2414  return add - mul * x;
2415 #else
2416  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2417 #endif
2418 }
2419 template <size_t N>
2420 HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
2421  const Vec128<double, N> x,
2422  const Vec128<double, N> add) {
2423 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2424  return add - mul * x;
2425 #else
2426  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2427 #endif
2428 }
2429 
2430 // Returns mul * x - sub
2431 template <size_t N>
2432 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2433  const Vec128<float, N> x,
2434  const Vec128<float, N> sub) {
2435 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2436  return mul * x - sub;
2437 #else
2438  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2439 #endif
2440 }
2441 template <size_t N>
2442 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
2443  const Vec128<double, N> x,
2444  const Vec128<double, N> sub) {
2445 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2446  return mul * x - sub;
2447 #else
2448  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2449 #endif
2450 }
2451 
2452 // Returns -mul * x - sub
2453 template <size_t N>
2454 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
2455  const Vec128<float, N> x,
2456  const Vec128<float, N> sub) {
2457 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2458  return Neg(mul) * x - sub;
2459 #else
2460  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2461 #endif
2462 }
2463 template <size_t N>
2464 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
2465  const Vec128<double, N> x,
2466  const Vec128<double, N> sub) {
2467 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2468  return Neg(mul) * x - sub;
2469 #else
2470  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2471 #endif
2472 }
2473 
2474 // ------------------------------ Floating-point square root
2475 
2476 // Full precision square root
2477 template <size_t N>
2478 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
2479  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2480 }
2481 HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
2482  return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
2483 }
2484 template <size_t N>
2485 HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
2486  return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
2487 }
2488 HWY_API Vec128<double, 1> Sqrt(const Vec128<double, 1> v) {
2489  return Vec128<double, 1>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
2490 }
2491 
2492 // Approximate reciprocal square root
2493 template <size_t N>
2494 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
2495  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2496 }
2497 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
2498  return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
2499 }
2500 
2501 // ------------------------------ Min (Gt, IfThenElse)
2502 
2503 namespace detail {
2504 
2505 template <typename T, size_t N>
2506 HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a,
2507  const Vec128<T, N> b) {
2508  const Simd<T, N> du;
2509  const RebindToSigned<decltype(du)> di;
2510  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2511  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2512  return IfThenElse(gt, b, a);
2513 }
2514 
2515 } // namespace detail
2516 
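// Illustrative note, not part of this header: XOR-ing both operands with the
// sign bit maps unsigned order onto signed order, so the signed comparison
// available on SSSE3 selects the correct unsigned minimum. Hypothetical helper
// inside HWY_NAMESPACE.
inline Vec128<uint32_t> UnsignedMinExample() {
  const Full128<uint32_t> d;
  // Every lane is 1; a plain signed compare would instead pick 0xFFFFFFFF.
  return Min(Set(d, 0xFFFFFFFFu), Set(d, 1u));
}
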
2517 // Unsigned
2518 template <size_t N>
2519 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
2520  const Vec128<uint8_t, N> b) {
2521  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
2522 }
2523 template <size_t N>
2524 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
2525  const Vec128<uint16_t, N> b) {
2526 #if HWY_TARGET == HWY_SSSE3
2527  return detail::MinU(a, b);
2528 #else
2529  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
2530 #endif
2531 }
2532 template <size_t N>
2533 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
2534  const Vec128<uint32_t, N> b) {
2535 #if HWY_TARGET == HWY_SSSE3
2536  return detail::MinU(a, b);
2537 #else
2538  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
2539 #endif
2540 }
2541 template <size_t N>
2542 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2543  const Vec128<uint64_t, N> b) {
2544 #if HWY_TARGET <= HWY_AVX3
2545  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
2546 #else
2547  return detail::MinU(a, b);
2548 #endif
2549 }
2550 
2551 // Signed
2552 template <size_t N>
2553 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
2554  const Vec128<int8_t, N> b) {
2555 #if HWY_TARGET == HWY_SSSE3
2556  return IfThenElse(a < b, a, b);
2557 #else
2558  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
2559 #endif
2560 }
2561 template <size_t N>
2562 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
2563  const Vec128<int16_t, N> b) {
2564  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
2565 }
2566 template <size_t N>
2567 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
2568  const Vec128<int32_t, N> b) {
2569 #if HWY_TARGET == HWY_SSSE3
2570  return IfThenElse(a < b, a, b);
2571 #else
2572  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
2573 #endif
2574 }
2575 template <size_t N>
2576 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2577  const Vec128<int64_t, N> b) {
2578 #if HWY_TARGET <= HWY_AVX3
2579  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
2580 #else
2581  return IfThenElse(a < b, a, b);
2582 #endif
2583 }
2584 
2585 // Float
2586 template <size_t N>
2587 HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
2588  const Vec128<float, N> b) {
2589  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
2590 }
2591 template <size_t N>
2592 HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
2593  const Vec128<double, N> b) {
2594  return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
2595 }
2596 
2597 // ------------------------------ Max (Gt, IfThenElse)
2598 
2599 namespace detail {
2600 template <typename T, size_t N>
2601 HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a,
2602  const Vec128<T, N> b) {
2603  const Simd<T, N> du;
2604  const RebindToSigned<decltype(du)> di;
2605  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2606  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2607  return IfThenElse(gt, a, b);
2608 }
2609 
2610 } // namespace detail
2611 
2612 // Unsigned
2613 template <size_t N>
2614 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
2615  const Vec128<uint8_t, N> b) {
2616  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
2617 }
2618 template <size_t N>
2619 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
2620  const Vec128<uint16_t, N> b) {
2621 #if HWY_TARGET == HWY_SSSE3
2622  return detail::MaxU(a, b);
2623 #else
2624  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
2625 #endif
2626 }
2627 template <size_t N>
2628 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
2629  const Vec128<uint32_t, N> b) {
2630 #if HWY_TARGET == HWY_SSSE3
2631  return detail::MaxU(a, b);
2632 #else
2633  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
2634 #endif
2635 }
2636 template <size_t N>
2637 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
2638  const Vec128<uint64_t, N> b) {
2639 #if HWY_TARGET <= HWY_AVX3
2640  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
2641 #else
2642  return detail::MaxU(a, b);
2643 #endif
2644 }
2645 
2646 // Signed
2647 template <size_t N>
2648 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
2649  const Vec128<int8_t, N> b) {
2650 #if HWY_TARGET == HWY_SSSE3
2651  return IfThenElse(a < b, b, a);
2652 #else
2653  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
2654 #endif
2655 }
2656 template <size_t N>
2657 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
2658  const Vec128<int16_t, N> b) {
2659  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
2660 }
2661 template <size_t N>
2662 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
2663  const Vec128<int32_t, N> b) {
2664 #if HWY_TARGET == HWY_SSSE3
2665  return IfThenElse(a < b, b, a);
2666 #else
2667  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
2668 #endif
2669 }
2670 template <size_t N>
2671 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2672  const Vec128<int64_t, N> b) {
2673 #if HWY_TARGET <= HWY_AVX3
2674  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
2675 #else
2676  return IfThenElse(a < b, b, a);
2677 #endif
2678 }
2679 
2680 // Float
2681 template <size_t N>
2682 HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
2683  const Vec128<float, N> b) {
2684  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
2685 }
2686 template <size_t N>
2687 HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
2688  const Vec128<double, N> b) {
2689  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
2690 }
2691 
2692 // ================================================== MEMORY (2)
2693 
2694 // ------------------------------ Non-temporal stores
2695 
2696 // On clang6, we see incorrect code generated for _mm_stream_pi, so
2697 // round even partial vectors up to 16 bytes.
2698 template <typename T, size_t N>
2699 HWY_API void Stream(Vec128<T, N> v, Simd<T, N> /* tag */,
2700  T* HWY_RESTRICT aligned) {
2701  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2702 }
2703 template <size_t N>
2704 HWY_API void Stream(const Vec128<float, N> v, Simd<float, N> /* tag */,
2705  float* HWY_RESTRICT aligned) {
2706  _mm_stream_ps(aligned, v.raw);
2707 }
2708 template <size_t N>
2709 HWY_API void Stream(const Vec128<double, N> v, Simd<double, N> /* tag */,
2710  double* HWY_RESTRICT aligned) {
2711  _mm_stream_pd(aligned, v.raw);
2712 }
2713 
2714 // ------------------------------ Scatter
2715 
2716 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2717 HWY_DIAGNOSTICS(push)
2718 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2719 
2720 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
2721 using GatherIndex64 = long long int; // NOLINT(google-runtime-int)
2722 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
2723 
2724 #if HWY_TARGET <= HWY_AVX3
2725 namespace detail {
2726 
2727 template <typename T, size_t N>
2728 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
2729  Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2730  const Vec128<int32_t, N> offset) {
2731  if (N == 4) {
2732  _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
2733  } else {
2734  const __mmask8 mask = (1u << N) - 1;
2735  _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
2736  }
2737 }
2738 template <typename T, size_t N>
2739 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
2740  Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2741  const Vec128<int32_t, N> index) {
2742  if (N == 4) {
2743  _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
2744  } else {
2745  const __mmask8 mask = (1u << N) - 1;
2746  _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
2747  }
2748 }
2749 
2750 template <typename T, size_t N>
2751 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
2752  Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2753  const Vec128<int64_t, N> offset) {
2754  if (N == 2) {
2755  _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
2756  } else {
2757  const __mmask8 mask = (1u << N) - 1;
2758  _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
2759  }
2760 }
2761 template <typename T, size_t N>
2762 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
2763  Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2764  const Vec128<int64_t, N> index) {
2765  if (N == 2) {
2766  _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
2767  } else {
2768  const __mmask8 mask = (1u << N) - 1;
2769  _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
2770  }
2771 }
2772 
2773 } // namespace detail
2774 
2775 template <typename T, size_t N, typename Offset>
2776 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2777  const Vec128<Offset, N> offset) {
2778  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2779  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2780 }
2781 template <typename T, size_t N, typename Index>
2782 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2783  const Vec128<Index, N> index) {
2784  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2785  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2786 }
2787 
2788 template <size_t N>
2789 HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */,
2790  float* HWY_RESTRICT base,
2791  const Vec128<int32_t, N> offset) {
2792  if (N == 4) {
2793  _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
2794  } else {
2795  const __mmask8 mask = (1u << N) - 1;
2796  _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
2797  }
2798 }
2799 template <size_t N>
2800 HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */,
2801  float* HWY_RESTRICT base,
2802  const Vec128<int32_t, N> index) {
2803  if (N == 4) {
2804  _mm_i32scatter_ps(base, index.raw, v.raw, 4);
2805  } else {
2806  const __mmask8 mask = (1u << N) - 1;
2807  _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
2808  }
2809 }
2810 
2811 template <size_t N>
2812 HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */,
2813  double* HWY_RESTRICT base,
2814  const Vec128<int64_t, N> offset) {
2815  if (N == 2) {
2816  _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
2817  } else {
2818  const __mmask8 mask = (1u << N) - 1;
2819  _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
2820  }
2821 }
2822 template <size_t N>
2823 HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */,
2824  double* HWY_RESTRICT base,
2825  const Vec128<int64_t, N> index) {
2826  if (N == 2) {
2827  _mm_i64scatter_pd(base, index.raw, v.raw, 8);
2828  } else {
2829  const __mmask8 mask = (1u << N) - 1;
2830  _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
2831  }
2832 }
2833 #else // HWY_TARGET <= HWY_AVX3
2834 
2835 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
2836 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2837  const Vec128<Offset, N> offset) {
2838  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2839 
2840  alignas(16) T lanes[N];
2841  Store(v, d, lanes);
2842 
2843  alignas(16) Offset offset_lanes[N];
2844  Store(offset, Simd<Offset, N>(), offset_lanes);
2845 
2846  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2847  for (size_t i = 0; i < N; ++i) {
2848  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2849  }
2850 }
2851 
2852 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
2853 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2854  const Vec128<Index, N> index) {
2855  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2856 
2857  alignas(16) T lanes[N];
2858  Store(v, d, lanes);
2859 
2860  alignas(16) Index index_lanes[N];
2861  Store(index, Simd<Index, N>(), index_lanes);
2862 
2863  for (size_t i = 0; i < N; ++i) {
2864  base[index_lanes[i]] = lanes[i];
2865  }
2866 }
2867 
2868 #endif
2869 
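// Illustrative usage sketch, not part of this header: scatter four int32
// lanes to arbitrary element indices. On AVX3 targets this uses the scatter
// intrinsics above; otherwise the store-and-copy loop is used. Hypothetical
// helper inside HWY_NAMESPACE.
inline void ScatterExample(int32_t* HWY_RESTRICT base) {
  const Full128<int32_t> d;
  alignas(16) const int32_t values[4] = {10, 20, 30, 40};
  alignas(16) const int32_t indices[4] = {3, 0, 2, 1};
  // base[3]=10, base[0]=20, base[2]=30, base[1]=40.
  ScatterIndex(Load(d, values), d, base, Load(d, indices));
}
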
2870 // ------------------------------ Gather (Load/Store)
2871 
2872 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2873 
2874 template <typename T, size_t N, typename Offset>
2875 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
2876  const T* HWY_RESTRICT base,
2877  const Vec128<Offset, N> offset) {
2878  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2879 
2880  alignas(16) Offset offset_lanes[N];
2881  Store(offset, Simd<Offset, N>(), offset_lanes);
2882 
2883  alignas(16) T lanes[N];
2884  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
2885  for (size_t i = 0; i < N; ++i) {
2886  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
2887  }
2888  return Load(d, lanes);
2889 }
2890 
2891 template <typename T, size_t N, typename Index>
2892 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
2893  const Vec128<Index, N> index) {
2894  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2895 
2896  alignas(16) Index index_lanes[N];
2897  Store(index, Simd<Index, N>(), index_lanes);
2898 
2899  alignas(16) T lanes[N];
2900  for (size_t i = 0; i < N; ++i) {
2901  lanes[i] = base[index_lanes[i]];
2902  }
2903  return Load(d, lanes);
2904 }
2905 
2906 #else
2907 
2908 namespace detail {
2909 
2910 template <typename T, size_t N>
2911 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
2912  Simd<T, N> /* d */,
2913  const T* HWY_RESTRICT base,
2914  const Vec128<int32_t, N> offset) {
2915  return Vec128<T, N>{_mm_i32gather_epi32(
2916  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2917 }
2918 template <typename T, size_t N>
2919 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
2920  Simd<T, N> /* d */,
2921  const T* HWY_RESTRICT base,
2922  const Vec128<int32_t, N> index) {
2923  return Vec128<T, N>{_mm_i32gather_epi32(
2924  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2925 }
2926 
2927 template <typename T, size_t N>
2928 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
2929  Simd<T, N> /* d */,
2930  const T* HWY_RESTRICT base,
2931  const Vec128<int64_t, N> offset) {
2932  return Vec128<T, N>{_mm_i64gather_epi64(
2933  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2934 }
2935 template <typename T, size_t N>
2936 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
2937  Simd<T, N> /* d */,
2938  const T* HWY_RESTRICT base,
2939  const Vec128<int64_t, N> index) {
2940  return Vec128<T, N>{_mm_i64gather_epi64(
2941  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2942 }
2943 
2944 } // namespace detail
2945 
2946 template <typename T, size_t N, typename Offset>
2947 HWY_API Vec128<T, N> GatherOffset(Simd<T, N> d, const T* HWY_RESTRICT base,
2948  const Vec128<Offset, N> offset) {
2949  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2950 }
2951 template <typename T, size_t N, typename Index>
2952 HWY_API Vec128<T, N> GatherIndex(Simd<T, N> d, const T* HWY_RESTRICT base,
2953  const Vec128<Index, N> index) {
2954  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2955 }
2956 
2957 template <size_t N>
2958 HWY_API Vec128<float, N> GatherOffset(Simd<float, N> /* tag */,
2959  const float* HWY_RESTRICT base,
2960  const Vec128<int32_t, N> offset) {
2961  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
2962 }
2963 template <size_t N>
2964 HWY_API Vec128<float, N> GatherIndex(Simd<float, N> /* tag */,
2965  const float* HWY_RESTRICT base,
2966  const Vec128<int32_t, N> index) {
2967  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
2968 }
2969 
2970 template <size_t N>
2971 HWY_API Vec128<double, N> GatherOffset(Simd<double, N> /* tag */,
2972  const double* HWY_RESTRICT base,
2973  const Vec128<int64_t, N> offset) {
2974  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
2975 }
2976 template <size_t N>
2977 HWY_API Vec128<double, N> GatherIndex(Simd<double, N> /* tag */,
2978  const double* HWY_RESTRICT base,
2979  const Vec128<int64_t, N> index) {
2980  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
2981 }
2982 
2983 #endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2984 
2985 HWY_DIAGNOSTICS(pop)
2986 
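// Illustrative usage sketch, not part of this header: gather four int32
// values from arbitrary element indices; SSSE3/SSE4 use the store-and-load
// loop, AVX2 and above use the gather intrinsics. Hypothetical helper inside
// HWY_NAMESPACE.
inline Vec128<int32_t> GatherExample(const int32_t* HWY_RESTRICT base) {
  const Full128<int32_t> d;
  alignas(16) const int32_t indices[4] = {3, 0, 2, 1};
  // Returns {base[3], base[0], base[2], base[1]}.
  return GatherIndex(d, base, Load(d, indices));
}
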
2987 // ================================================== SWIZZLE (2)
2988 
2989 // ------------------------------ LowerHalf
2990 
2991 // Returns upper/lower half of a vector.
2992 template <typename T, size_t N>
2993 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
2994  return Vec128<T, N / 2>{v.raw};
2995 }
2996 
2997 template <typename T, size_t N>
2998 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2999  return LowerHalf(Simd<T, N / 2>(), v);
3000 }
3001 
3002 // ------------------------------ ShiftLeftBytes
3003 
3004 template <int kBytes, typename T, size_t N>
3005 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3006  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3007  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3008 }
3009 
3010 template <int kBytes, typename T, size_t N>
3011 HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3012  return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
3013 }
3014 
3015 // ------------------------------ ShiftLeftLanes
3016 
3017 template <int kLanes, typename T, size_t N>
3018 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v) {
3019  const Repartition<uint8_t, decltype(d)> d8;
3020  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3021 }
3022 
3023 template <int kLanes, typename T, size_t N>
3024 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3025  return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
3026 }
3027 
3028 // ------------------------------ ShiftRightBytes
3029 template <int kBytes, typename T, size_t N>
3030 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3031  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3032  // For partial vectors, clear upper lanes so we shift in zeros.
3033  if (N != 16 / sizeof(T)) {
3034  const Vec128<T> vfull{v.raw};
3035  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3036  }
3037  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3038 }
3039 
3040 // ------------------------------ ShiftRightLanes
3041 template <int kLanes, typename T, size_t N>
3042 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v) {
3043  const Repartition<uint8_t, decltype(d)> d8;
3044  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3045 }
3046 
3047 // ------------------------------ UpperHalf (ShiftRightBytes)
3048 
3049 // Full input: copy hi into lo (smaller instruction encoding than shifts).
3050 template <typename T>
3051 HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Half<Full128<T>> /* tag */,
3052  Vec128<T> v) {
3053  return Vec128<T, 8 / sizeof(T)>{_mm_unpackhi_epi64(v.raw, v.raw)};
3054 }
3055 HWY_API Vec128<float, 2> UpperHalf(Simd<float, 2> /* tag */, Vec128<float> v) {
3056  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3057 }
3058 HWY_API Vec128<double, 1> UpperHalf(Simd<double, 1> /* tag */,
3059  Vec128<double> v) {
3060  return Vec128<double, 1>{_mm_unpackhi_pd(v.raw, v.raw)};
3061 }
3062 
3063 // Partial
3064 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3065 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
3066  Vec128<T, N> v) {
3067  const Simd<T, N> d;
3068  const auto vu = BitCast(RebindToUnsigned<decltype(d)>(), v);
3069  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(vu));
3070  return Vec128<T, (N + 1) / 2>{upper.raw};
3071 }
3072 
3073 // ------------------------------ CombineShiftRightBytes
3074 
3075 template <int kBytes, typename T, class V = Vec128<T>>
3076 HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3077  const Repartition<uint8_t, decltype(d)> d8;
3078  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3079  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3080 }
3081 
3082 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3083  class V = Vec128<T, N>>
3084 HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
3085  constexpr size_t kSize = N * sizeof(T);
3086  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3087  const Repartition<uint8_t, decltype(d)> d8;
3088  const Full128<uint8_t> d_full8;
3089  using V8 = VFromD<decltype(d_full8)>;
3090  const V8 hi8{BitCast(d8, hi).raw};
3091  // Move into most-significant bytes
3092  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3093  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3094  return V{BitCast(Full128<T>(), r).raw};
3095 }
3096 
3097 // ------------------------------ Broadcast/splat any lane
3098 
3099 // Unsigned
3100 template <int kLane, size_t N>
3101 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3102  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3103  if (kLane < 4) {
3104  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3105  return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3106  } else {
3107  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3108  return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3109  }
3110 }
3111 template <int kLane, size_t N>
3112 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3113  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3114  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3115 }
3116 template <int kLane, size_t N>
3117 HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
3118  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3119  return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3120 }
3121 
3122 // Signed
3123 template <int kLane, size_t N>
3124 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3125  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3126  if (kLane < 4) {
3127  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3128  return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3129  } else {
3130  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3131  return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3132  }
3133 }
3134 template <int kLane, size_t N>
3135 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3136  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3137  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3138 }
3139 template <int kLane, size_t N>
3140 HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
3141  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3142  return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3143 }
3144 
3145 // Float
3146 template <int kLane, size_t N>
3147 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3148  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3149  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
3150 }
3151 template <int kLane, size_t N>
3152 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
3153  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3154  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
3155 }
3156 
3157 // ------------------------------ TableLookupBytes
3158 template <typename T, size_t N, typename TI, size_t NI>
3159 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
3160  const Vec128<TI, NI> from) {
3161  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3162 }
3163 
3164 // ------------------------------ TableLookupBytesOr0
3165 // For all vector widths; x86 already zeroes output bytes whose index is >= 0x80.
3166 template <class V, class VI>
3167 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
3168  return TableLookupBytes(bytes, from);
3169 }
3170 
3171 // ------------------------------ TableLookupLanes
3172 
3173 // Returned by SetTableIndices for use by TableLookupLanes.
3174 template <typename T, size_t N>
3175 struct Indices128 {
3176  __m128i raw;
3177 };
3178 
3179 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3180 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
3181 #if HWY_IS_DEBUG_BUILD
3182  for (size_t i = 0; i < N; ++i) {
3183  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
3184  }
3185 #endif
3186 
3187  const Repartition<uint8_t, decltype(d)> d8;
3188  alignas(16) uint8_t control[16] = {0};
3189  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
3190  for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
3191  control[idx_lane * sizeof(T) + idx_byte] =
3192  static_cast<uint8_t>(size_t(idx[idx_lane]) * sizeof(T) + idx_byte);
3193  }
3194  }
3195  return Indices128<T, N>{Load(d8, control).raw};
3196 }
3197 
3198 template <size_t N>
3199 HWY_API Vec128<uint32_t, N> TableLookupLanes(
3200  const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
3201  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
3202 }
3203 template <size_t N>
3204 HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
3205  const Indices128<int32_t, N> idx) {
3206  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
3207 }
3208 template <size_t N>
3209 HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
3210  const Indices128<float, N> idx) {
3211  const Simd<int32_t, N> di;
3212  const Simd<float, N> df;
3213  return BitCast(df,
3214  TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
3215 }
3216 
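// Illustrative usage sketch, not part of this header: reverse the four u32
// lanes via a runtime index vector; indices must be < N. Hypothetical helper
// inside HWY_NAMESPACE.
inline Vec128<uint32_t> ReverseViaTable(Vec128<uint32_t> v) {
  const Full128<uint32_t> d;
  const int32_t idx[4] = {3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, idx));
}
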
3217 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
3218 
3219 template <typename T>
3220 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3221  return Shuffle0123(v);
3222 }
3223 
3224 template <typename T>
3225 HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
3226  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
3227 }
3228 
3229 template <typename T>
3230 HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
3231  return v;
3232 }
3233 
3234 // ------------------------------ InterleaveLower
3235 
3236 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3237 // the least-significant lane) and "b". To concatenate two half-width integers
3238 // into one, use ZipLower/Upper instead (also works with scalar).
3239 
3240 template <size_t N, HWY_IF_LE128(uint8_t, N)>
3241 HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
3242  const Vec128<uint8_t, N> b) {
3243  return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3244 }
3245 template <size_t N, HWY_IF_LE128(uint16_t, N)>
3246 HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
3247  const Vec128<uint16_t, N> b) {
3248  return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3249 }
3250 template <size_t N, HWY_IF_LE128(uint32_t, N)>
3251 HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
3252  const Vec128<uint32_t, N> b) {
3253  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3254 }
3255 template <size_t N, HWY_IF_LE128(uint64_t, N)>
3256 HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
3257  const Vec128<uint64_t, N> b) {
3258  return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3259 }
3260 
3261 template <size_t N, HWY_IF_LE128(int8_t, N)>
3262 HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
3263  const Vec128<int8_t, N> b) {
3264  return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3265 }
3266 template <size_t N, HWY_IF_LE128(int16_t, N)>
3267 HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
3268  const Vec128<int16_t, N> b) {
3269  return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3270 }
3271 template <size_t N, HWY_IF_LE128(int32_t, N)>
3272 HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
3273  const Vec128<int32_t, N> b) {
3274  return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3275 }
3276 template <size_t N, HWY_IF_LE128(int64_t, N)>
3277 HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
3278  const Vec128<int64_t, N> b) {
3279  return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3280 }
3281 
3282 template <size_t N, HWY_IF_LE128(float, N)>
3283 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
3284  const Vec128<float, N> b) {
3285  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3286 }
3287 template <size_t N, HWY_IF_LE128(double, N)>
3288 HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
3289  const Vec128<double, N> b) {
3290  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
3291 }
3292 
3293 // Additional overload for the optional Simd<> tag.
3294 template <typename T, size_t N, HWY_IF_LE128(T, N), class V = Vec128<T, N>>
3295 HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
3296  return InterleaveLower(a, b);
3297 }
3298 
3299 // ------------------------------ InterleaveUpper (UpperHalf)
3300 
3301 // All functions inside detail lack the required D parameter.
3302 namespace detail {
3303 
3304 HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
3305  const Vec128<uint8_t> b) {
3306  return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3307 }
3308 HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
3309  const Vec128<uint16_t> b) {
3310  return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3311 }
3312 HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
3313  const Vec128<uint32_t> b) {
3314  return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3315 }
3316 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
3317  const Vec128<uint64_t> b) {
3318  return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3319 }
3320 
3321 HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
3322  const Vec128<int8_t> b) {
3323  return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3324 }
3325 HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
3326  const Vec128<int16_t> b) {
3327  return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3328 }
3329 HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
3330  const Vec128<int32_t> b) {
3331  return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3332 }
3333 HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
3334  const Vec128<int64_t> b) {
3335  return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3336 }
3337 
3338 HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
3339  const Vec128<float> b) {
3340  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
3341 }
3342 HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
3343  const Vec128<double> b) {
3344  return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
3345 }
3346 
3347 } // namespace detail
3348 
3349 // Full
3350 template <typename T, class V = Vec128<T>>
3351 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
3352  return detail::InterleaveUpper(a, b);
3353 }
3354 
3355 // Partial
3356 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
3357 HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
3358  const Half<decltype(d)> d2;
3359  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
3360 }
3361 
3362 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3363 
3364 // Same as Interleave*, except that the return lanes are double-width integers;
3365 // this is necessary because the single-lane scalar cannot return two values.
3366 template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
3367 HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
3368  return BitCast(DW(), InterleaveLower(a, b));
3369 }
3370 template <typename T, size_t N, class D = Simd<T, N>,
3371  class DW = RepartitionToWide<D>>
3372 HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3373  return BitCast(dw, InterleaveLower(D(), a, b));
3374 }
3375 
3376 template <typename T, size_t N, class D = Simd<T, N>,
3377  class DW = RepartitionToWide<D>>
3378 HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3379  return BitCast(dw, InterleaveUpper(D(), a, b));
3380 }
3381 
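// Illustrative sketch, not part of this header: zipping two u8 vectors yields
// u16 lanes whose low byte comes from the first argument and high byte from
// the second. Hypothetical helper inside HWY_NAMESPACE.
inline Vec128<uint16_t, 8> ZipLowerExample() {
  const Full128<uint8_t> d8;
  return ZipLower(Set(d8, 0x01), Set(d8, 0x02));  // every u16 lane is 0x0201
}
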
3382 // ================================================== COMBINE
3383 
3384 // ------------------------------ Combine (InterleaveLower)
3385 
3386 // N = N/2 + N/2 (upper half undefined)
3387 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3388 HWY_API Vec128<T, N> Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
3389  Vec128<T, N / 2> lo_half) {
3390  const Half<decltype(d)> d2;
3391  const RebindToUnsigned<decltype(d2)> du2;
3392  // Treat half-width input as one lane, and expand to two lanes.
3393  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
3394  const VU lo{BitCast(du2, lo_half).raw};
3395  const VU hi{BitCast(du2, hi_half).raw};
3396  return BitCast(d, InterleaveLower(lo, hi));
3397 }
3398 
3399 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
3400 
3401 template <typename T, HWY_IF_NOT_FLOAT(T)>
3402 HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */,
3403  Vec128<T, 8 / sizeof(T)> lo) {
3404  return Vec128<T>{_mm_move_epi64(lo.raw)};
3405 }
3406 
3407 template <typename T, HWY_IF_FLOAT(T)>
3408 HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec128<T, 8 / sizeof(T)> lo) {
3409  const RebindToUnsigned<decltype(d)> du;
3410  return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
3411 }
3412 
3413 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3414 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N / 2> lo) {
3415  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
3416 }
3417 
3418 // ------------------------------ Concat full (InterleaveLower)
3419 
3420 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3421 template <typename T>
3422 HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3423  const Repartition<uint64_t, decltype(d)> d64;
3424  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
3425 }
3426 
3427 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3428 template <typename T>
3429 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3430  const Repartition<uint64_t, decltype(d)> d64;
3431  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
3432 }
3433 
3434 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
3435 template <typename T>
3436 HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
3437  const Vec128<T> lo) {
3438  return CombineShiftRightBytes<8>(d, hi, lo);
3439 }
3440 
3441 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3442 template <typename T>
3443 HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3444 #if HWY_TARGET == HWY_SSSE3
3445  const Full128<double> dd;
3446  const __m128d concat = _mm_move_sd(BitCast(dd, hi).raw, BitCast(dd, lo).raw);
3447  return BitCast(d, Vec128<double>{concat});
3448 #else
3449  (void)d;
3450  return Vec128<T>{_mm_blend_epi16(hi.raw, lo.raw, 0x0F)};
3451 #endif
3452 }
3453 HWY_API Vec128<float> ConcatUpperLower(Full128<float> /* tag */,
3454  const Vec128<float> hi,
3455  const Vec128<float> lo) {
3456  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
3457 }
3458 HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
3459  const Vec128<double> hi,
3460  const Vec128<double> lo) {
3461  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
3462 }
3463 
3464 // ------------------------------ Concat partial (Combine, LowerHalf)
3465 
3466 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3467 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N> d, Vec128<T, N> hi,
3468  Vec128<T, N> lo) {
3469  const Half<decltype(d)> d2;
3470  return Combine(LowerHalf(d2, hi), LowerHalf(d2, lo));
3471 }
3472 
3473 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3474 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N> d, Vec128<T, N> hi,
3475  Vec128<T, N> lo) {
3476  const Half<decltype(d)> d2;
3477  return Combine(UpperHalf(d2, hi), UpperHalf(d2, lo));
3478 }
3479 
3480 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3481 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N> d, const Vec128<T, N> hi,
3482  const Vec128<T, N> lo) {
3483  const Half<decltype(d)> d2;
3484  return Combine(LowerHalf(d2, hi), UpperHalf(d2, lo));
3485 }
3486 
3487 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3488 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, Vec128<T, N> hi,
3489  Vec128<T, N> lo) {
3490  const Half<decltype(d)> d2;
3491  return Combine(UpperHalf(d2, hi), LowerHalf(d2, lo));
3492 }
3493 
3494 // ------------------------------ ConcatOdd
3495 
3496 // 32-bit full
3497 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3498 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3499  const RebindToFloat<decltype(d)> df;
3500  return BitCast(
3501  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
3502  _MM_SHUFFLE(3, 1, 3, 1))});
3503 }
3504 template <size_t N>
3505 HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
3506  Vec128<float> lo) {
3507  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3508 }
3509 
3510 // 32-bit partial
3511 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3512 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2> d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3513  return InterleaveUpper(d, lo, hi);
3514 }
3515 
3516 // 64-bit full - no partial because we need at least two inputs to have
3517 // even/odd.
3518 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3519 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3520  return InterleaveUpper(d, lo, hi);
3521 }
3522 
3523 // ------------------------------ ConcatEven (InterleaveLower)
3524 
3525 // 32-bit full
3526 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3527 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3528  const RebindToFloat<decltype(d)> df;
3529  return BitCast(
3530  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
3531  _MM_SHUFFLE(2, 0, 2, 0))});
3532 }
3533 template <size_t N>
3534 HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
3535  Vec128<float> lo) {
3536  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3537 }
3538 
3539 // 32-bit partial
3540 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3541 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2> d, Vec128<T, 2> hi,
3542  Vec128<T, 2> lo) {
3543  return InterleaveLower(d, lo, hi);
3544 }
3545 
3546 // 64-bit full - no partial because we need at least two inputs to have
3547 // even/odd.
3548 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3549 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3550  return InterleaveLower(d, lo, hi);
3551 }
3552 
3553 // ------------------------------ OddEven (IfThenElse)
3554 
3555 namespace detail {
3556 
3557 template <typename T, size_t N>
3558 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
3559  const Vec128<T, N> b) {
3560  const Simd<T, N> d;
3561  const Repartition<uint8_t, decltype(d)> d8;
3562  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3563  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3564  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3565 }
3566 template <typename T, size_t N>
3567 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
3568  const Vec128<T, N> b) {
3569 #if HWY_TARGET == HWY_SSSE3
3570  const Simd<T, N> d;
3571  const Repartition<uint8_t, decltype(d)> d8;
3572  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
3573  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
3574  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3575 #else
3576  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
3577 #endif
3578 }
3579 template <typename T, size_t N>
3580 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
3581  const Vec128<T, N> b) {
3582 #if HWY_TARGET == HWY_SSSE3
3583  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3584  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3585  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
3586 #else
3587  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
3588 #endif
3589 }
3590 template <typename T, size_t N>
3591 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
3592  const Vec128<T, N> b) {
3593 #if HWY_TARGET == HWY_SSSE3
3594  const Full128<double> dd;
3595  const __m128d concat = _mm_move_sd(BitCast(dd, a).raw, BitCast(dd, b).raw);
3596  return BitCast(Full128<T>(), Vec128<double>{concat});
3597 #else
3598  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
3599 #endif
3600 }
3601 
3602 } // namespace detail
3603 
3604 template <typename T, size_t N>
3605 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3606  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3607 }
3608 template <size_t N>
3609 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3610  const Vec128<float, N> b) {
3611 #if HWY_TARGET == HWY_SSSE3
3612  // SHUFPS must fill the lower half of the output from one register, so we
3613  // need another shuffle. Unpack avoids another immediate byte.
3614  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3615  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3616  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
3617 #else
3618  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
3619 #endif
3620 }
3621 
3622 template <size_t N>
3623 HWY_API Vec128<double, N> OddEven(const Vec128<double, N> a,
3624  const Vec128<double, N> b) {
3625  return Vec128<double, N>{_mm_shuffle_pd(b.raw, a.raw, _MM_SHUFFLE2(1, 0))};
3626 }
3627 
3628 // ------------------------------ Shl (ZipLower, Mul)
3629 
3630 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
3631 // two from loading float exponents, which is considerably faster (according
3632 // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
3633 
3634 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
3635 namespace detail {
3636 
3637 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3638 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3639 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
3640  const Simd<T, N> d;
3641  const RepartitionToWide<decltype(d)> dw;
3642  const Rebind<float, decltype(dw)> df;
3643  const auto zero = Zero(d);
3644  // Move into exponent (this u16 will become the upper half of an f32)
3645  const auto exp = ShiftLeft<23 - 16>(v);
3646  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
3647  // Insert 0 into lower halves for reinterpreting as binary32.
3648  const auto f0 = ZipLower(dw, zero, upper);
3649  const auto f1 = ZipUpper(dw, zero, upper);
3650  // See comment below.
3651  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
3652  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
3653  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
3654 }
3655 
3656 // Same, for 32-bit shifts.
3657 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3658 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
3659  const Simd<T, N> d;
3660  const auto exp = ShiftLeft<23>(v);
3661  const auto f = exp + Set(d, 0x3F800000); // 1.0f
3662  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
3663  // behavior. cvt instead of cvtt should be equivalent, but avoids test
3664  // failure under GCC 10.2.1.
3665  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
3666 }
3667 
3668 } // namespace detail
3669 #endif // HWY_TARGET > HWY_AVX3
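// [Editorial worked example, not part of the original header] How Pow2 turns a
// shift count into a power of two via the float exponent field: for a 32-bit
// lane with v = 5, ShiftLeft<23>(v) places 5 in the exponent bits, and adding
// 0x3F800000 (1.0f) gives 0x42000000, the binary32 encoding of 32.0f
// (exponent 132 - 127 = 5). _mm_cvtps_epi32 then returns 32 = 2^5. The 16-bit
// variant builds the same binary32 by zipping a zero lane below the prepared
// upper half, then packs the converted results back down to u16.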
3670 
3671 template <size_t N>
3672 HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
3673  const Vec128<uint16_t, N> bits) {
3674 #if HWY_TARGET <= HWY_AVX3
3675  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
3676 #else
3677  return v * detail::Pow2(bits);
3678 #endif
3679 }
3680 HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
3681  const Vec128<uint16_t, 1> bits) {
3682  return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
3683 }
3684 
3685 template <size_t N>
3686 HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
3687  const Vec128<uint32_t, N> bits) {
3688 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3689  return v * detail::Pow2(bits);
3690 #else
3691  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
3692 #endif
3693 }
3694 HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
3695  const Vec128<uint32_t, 1> bits) {
3696  return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
3697 }
3698 
3699 HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
3700  const Vec128<uint64_t> bits) {
3701 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3702  // Individual shifts and combine
3703  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
3704  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
3705  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
3706  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
3707 #else
3708  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
3709 #endif
3710 }
3711 HWY_API Vec128<uint64_t, 1> operator<<(const Vec128<uint64_t, 1> v,
3712  const Vec128<uint64_t, 1> bits) {
3713  return Vec128<uint64_t, 1>{_mm_sll_epi64(v.raw, bits.raw)};
3714 }
3715 
3716 // Signed left shift is the same as unsigned.
3717 template <typename T, size_t N, HWY_IF_SIGNED(T)>
3718 HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
3719  const Simd<T, N> di;
3720  const Simd<MakeUnsigned<T>, N> du;
3721  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3722 }
3723 
3724 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
3725 
3726 // Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
3727 // widening multiplication by powers of two obtained by loading float exponents,
3728 // followed by a constant right-shift. This is still faster than a scalar or
3729 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
3730 
3731 template <size_t N>
3732 HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
3733  const Vec128<uint16_t, N> bits) {
3734 #if HWY_TARGET <= HWY_AVX3
3735  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
3736 #else
3737  const Simd<uint16_t, N> d;
3738  // For bits=0, we cannot mul by 2^16, so fix the result later.
3739  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
3740  // Replace output with input where bits == 0.
3741  return IfThenElse(bits == Zero(d), in, out);
3742 #endif
3743 }
3744 HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
3745  const Vec128<uint16_t, 1> bits) {
3746  return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
3747 }
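// [Editorial worked example, not part of the original header] The non-AVX3
// u16 right shift above computes MulHigh(in, 2^(16 - bits)): e.g. for
// in = 0x8000 and bits = 3, Pow2(13) = 0x2000, and the upper 16 bits of
// 0x8000 * 0x2000 = 2^28 are 0x1000 = 0x8000 >> 3. bits = 0 would require
// multiplying by 2^16, which does not fit in a u16 lane, hence the final
// IfThenElse restores the unshifted input for those lanes.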
3748 
3749 template <size_t N>
3750 HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
3751  const Vec128<uint32_t, N> bits) {
3752 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3753  // 32x32 -> 64 bit mul, then shift right by 32.
3754  const Simd<uint32_t, N> d32;
3755  // Move odd lanes into position for the second mul. Shuffle more gracefully
3756  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
3757  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
3758  // For bits=0, we cannot mul by 2^32, so fix the result later.
3759  const auto mul = detail::Pow2(Set(d32, 32) - bits);
3760  const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
3761  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
3762  // No need to shift right, already in the correct position.
3763  const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
3764  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
3765  // Replace output with input where bits == 0.
3766  return IfThenElse(bits == Zero(d32), in, out);
3767 #else
3768  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
3769 #endif
3770 }
3771 HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
3772  const Vec128<uint32_t, 1> bits) {
3773  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
3774 }
3775 
3776 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
3777  const Vec128<uint64_t> bits) {
3778 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3779  // Individual shifts and combine
3780  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
3781  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
3782  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
3783  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
3784 #else
3785  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
3786 #endif
3787 }
3788 HWY_API Vec128<uint64_t, 1> operator>>(const Vec128<uint64_t, 1> v,
3789  const Vec128<uint64_t, 1> bits) {
3790  return Vec128<uint64_t, 1>{_mm_srl_epi64(v.raw, bits.raw)};
3791 }
3792 
3793 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
3794 namespace detail {
3795 
3796 // Also used in x86_256-inl.h.
3797 template <class DI, class V>
3798 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
3799  const RebindToUnsigned<DI> du;
3800  const auto count = BitCast(du, count_i); // same type as value to shift
3801  // Clear sign and restore afterwards. This is preferable to shifting the MSB
3802  // downwards because Shr is somewhat more expensive than Shl.
3803  const auto sign = BroadcastSignBit(v);
3804  const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
3805  return BitCast(di, abs >> count) ^ sign;
3806 }
3807 
3808 } // namespace detail
3809 #endif // HWY_TARGET > HWY_AVX3
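// [Editorial worked example, not part of the original header] SignedShr above
// emulates an arithmetic shift with a logical one: for v = -16 and count = 2,
// sign = 0xFF..FF, so v ^ sign = 15 (i.e. -v - 1, the "off by one" value).
// The logical shift gives 15 >> 2 = 3, and 3 ^ sign = -4, which matches
// -16 >> 2 under arithmetic (sign-extending) shift semantics.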
3810 
3811 template <size_t N>
3812 HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
3813  const Vec128<int16_t, N> bits) {
3814 #if HWY_TARGET <= HWY_AVX3
3815  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
3816 #else
3817  return detail::SignedShr(Simd<int16_t, N>(), v, bits);
3818 #endif
3819 }
3820 HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
3821  const Vec128<int16_t, 1> bits) {
3822  return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
3823 }
3824 
3825 template <size_t N>
3826 HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
3827  const Vec128<int32_t, N> bits) {
3828 #if HWY_TARGET <= HWY_AVX3
3829  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
3830 #else
3831  return detail::SignedShr(Simd<int32_t, N>(), v, bits);
3832 #endif
3833 }
3834 HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
3835  const Vec128<int32_t, 1> bits) {
3836  return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
3837 }
3838 
3839 template <size_t N>
3840 HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
3841  const Vec128<int64_t, N> bits) {
3842 #if HWY_TARGET <= HWY_AVX3
3843  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
3844 #else
3845  return detail::SignedShr(Simd<int64_t, N>(), v, bits);
3846 #endif
3847 }
3848 
3849 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
3850 
3851 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
3852  const Vec128<uint64_t> b) {
3853  alignas(16) uint64_t mul[2];
3854  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
3855  return Load(Full128<uint64_t>(), mul);
3856 }
3857 
3858 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
3859  const Vec128<uint64_t> b) {
3860  alignas(16) uint64_t mul[2];
3861  const Half<Full128<uint64_t>> d2;
3862  mul[0] =
3863  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
3864  return Load(Full128<uint64_t>(), mul);
3865 }
3866 
3867 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3868 
3869 template <size_t N>
3870 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
3871  Vec128<bfloat16_t, 2 * N> a,
3872  Vec128<bfloat16_t, 2 * N> b,
3873  const Vec128<float, N> sum0,
3874  Vec128<float, N>& sum1) {
3875  // TODO(janwas): _mm_dpbf16_ps when available
3876  const Repartition<uint16_t, decltype(df32)> du16;
3877  const RebindToUnsigned<decltype(df32)> du32;
3878  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
3879  // Lane order within sum0/1 is undefined, hence we can avoid the
3880  // longer-latency lane-crossing PromoteTo.
3881  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
3882  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3883  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
3884  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3885  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3886  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3887 }
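// [Editorial note, not part of the original header] A bfloat16 is the upper 16
// bits of a binary32, so ZipLower/ZipUpper with a zero vector reconstructs the
// exact float value of each bf16 lane without a lane-crossing PromoteTo: e.g.
// bf16 bits 0x3FC0 zipped above a zero u16 lane give 0x3FC00000 = 1.5f. The
// price is that sum0 accumulates products of the lower-half lanes and sum1
// those of the upper half, which is why the lane order is documented as
// unspecified.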
3888 
3889 // ================================================== CONVERT
3890 
3891 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3892 
3893 // Unsigned: zero-extend.
3894 template <size_t N>
3895 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
3896  const Vec128<uint8_t, N> v) {
3897 #if HWY_TARGET == HWY_SSSE3
3898  const __m128i zero = _mm_setzero_si128();
3899  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
3900 #else
3901  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
3902 #endif
3903 }
3904 template <size_t N>
3905 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
3906  const Vec128<uint16_t, N> v) {
3907 #if HWY_TARGET == HWY_SSSE3
3908  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
3909 #else
3910  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
3911 #endif
3912 }
3913 template <size_t N>
3914 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */,
3915  const Vec128<uint32_t, N> v) {
3916 #if HWY_TARGET == HWY_SSSE3
3917  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
3918 #else
3919  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
3920 #endif
3921 }
3922 template <size_t N>
3923 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
3924  const Vec128<uint8_t, N> v) {
3925 #if HWY_TARGET == HWY_SSSE3
3926  const __m128i zero = _mm_setzero_si128();
3927  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
3928  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
3929 #else
3930  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
3931 #endif
3932 }
3933 
3934 // Unsigned to signed: same plus cast.
3935 template <size_t N>
3936 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> di,
3937  const Vec128<uint8_t, N> v) {
3938  return BitCast(di, PromoteTo(Simd<uint16_t, N>(), v));
3939 }
3940 template <size_t N>
3941 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
3942  const Vec128<uint16_t, N> v) {
3943  return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
3944 }
3945 template <size_t N>
3946 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
3947  const Vec128<uint8_t, N> v) {
3948  return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
3949 }
3950 
3951 // Signed: replicate sign bit.
3952 template <size_t N>
3953 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
3954  const Vec128<int8_t, N> v) {
3955 #if HWY_TARGET == HWY_SSSE3
3956  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
3957 #else
3958  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
3959 #endif
3960 }
3961 template <size_t N>
3962 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
3963  const Vec128<int16_t, N> v) {
3964 #if HWY_TARGET == HWY_SSSE3
3965  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
3966 #else
3967  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
3968 #endif
3969 }
3970 template <size_t N>
3971 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */,
3972  const Vec128<int32_t, N> v) {
3973 #if HWY_TARGET == HWY_SSSE3
3974  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
3975 #else
3976  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
3977 #endif
3978 }
3979 template <size_t N>
3980 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
3981  const Vec128<int8_t, N> v) {
3982 #if HWY_TARGET == HWY_SSSE3
3983  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
3984  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
3985  return ShiftRight<24>(Vec128<int32_t, N>{x4});
3986 #else
3987  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
3988 #endif
3989 }
3990 
3991 // Workaround for origin tracking bug in Clang msan prior to 11.0
3992 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
3993 #if defined(MEMORY_SANITIZER) && \
3994  (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
3995 #define HWY_INLINE_F16 HWY_NOINLINE
3996 #else
3997 #define HWY_INLINE_F16 HWY_INLINE
3998 #endif
3999 template <size_t N>
4000 HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N> df32,
4001  const Vec128<float16_t, N> v) {
4002 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4003  const RebindToSigned<decltype(df32)> di32;
4004  const RebindToUnsigned<decltype(df32)> du32;
4005  // Expand to u32 so we can shift.
4006  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
4007  const auto sign = ShiftRight<15>(bits16);
4008  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4009  const auto mantissa = bits16 & Set(du32, 0x3FF);
4010  const auto subnormal =
4011  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4012  Set(df32, 1.0f / 16384 / 1024));
4013 
4014  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4015  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4016  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4017  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4018  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4019 #else
4020  (void)df32;
4021  return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
4022 #endif
4023 }
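// [Editorial worked example, not part of the original header] For the manual
// f16 -> f32 path above: the half 0x3C00 (1.0) has sign 0, biased exponent 15
// and mantissa 0, so biased_exp32 = 15 + (127 - 15) = 127 and the result bits
// are 127 << 23 = 0x3F800000 = 1.0f. A subnormal half such as 0x0001 takes the
// other branch: its mantissa 1 is scaled by 2^-24 (1.0f / 16384 / 1024),
// matching the smallest positive half value.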
4024 
4025 template <size_t N>
4026 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
4027  const Vec128<bfloat16_t, N> v) {
4028  const Rebind<uint16_t, decltype(df32)> du16;
4029  const RebindToSigned<decltype(df32)> di32;
4030  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4031 }
4032 
4033 template <size_t N>
4034 HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
4035  const Vec128<float, N> v) {
4036  return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
4037 }
4038 
4039 template <size_t N>
4040 HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
4041  const Vec128<int32_t, N> v) {
4042  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
4043 }
4044 
4045 // ------------------------------ Demotions (full -> part w/ narrow lanes)
4046 
4047 template <size_t N>
4048 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
4049  const Vec128<int32_t, N> v) {
4050 #if HWY_TARGET == HWY_SSSE3
4051  const Simd<int32_t, N> di32;
4052  const Simd<uint16_t, N * 2> du16;
4053  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
4054  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
4055  const auto clamped = Or(zero_if_neg, too_big);
4056  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
4057  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
4058  0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
4059  const auto lo2 = Load(du16, kLower2Bytes);
4060  return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
4061 #else
4062  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
4063 #endif
4064 }
4065 
4066 template <size_t N>
4067 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
4068  const Vec128<int32_t, N> v) {
4069  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
4070 }
4071 
4072 template <size_t N>
4073 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
4074  const Vec128<int32_t, N> v) {
4075  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4076  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
4077 }
4078 
4079 template <size_t N>
4080 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
4081  const Vec128<int16_t, N> v) {
4082  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
4083 }
4084 
4085 template <size_t N>
4086 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
4087  const Vec128<int32_t, N> v) {
4088  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4089  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
4090 }
4091 
4092 template <size_t N>
4093 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
4094  const Vec128<int16_t, N> v) {
4095  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
4096 }
4097 
4098 template <size_t N>
4099 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> df16,
4100  const Vec128<float, N> v) {
4101 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4102  const RebindToUnsigned<decltype(df16)> du16;
4103  const Rebind<uint32_t, decltype(df16)> du;
4104  const RebindToSigned<decltype(du)> di;
4105  const auto bits32 = BitCast(du, v);
4106  const auto sign = ShiftRight<31>(bits32);
4107  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4108  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4109 
4110  const auto k15 = Set(di, 15);
4111  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4112  const auto is_tiny = exp < Set(di, -24);
4113 
4114  const auto is_subnormal = exp < Set(di, -14);
4115  const auto biased_exp16 =
4116  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
4117  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
4118  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4119  (mantissa32 >> (Set(du, 13) + sub_exp));
4120  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
4121  ShiftRight<13>(mantissa32)); // <1024
4122 
4123  const auto sign16 = ShiftLeft<15>(sign);
4124  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4125  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
4126  return BitCast(df16, DemoteTo(du16, bits16));
4127 #else
4128  (void)df16;
4129  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
4130 #endif
4131 }
4132 
4133 template <size_t N>
4134 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
4135  const Vec128<float, N> v) {
4136  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
4137  const Rebind<int32_t, decltype(dbf16)> di32;
4138  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4139  const Rebind<uint16_t, decltype(dbf16)> du16;
4140  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4141  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4142 }
4143 
4144 template <size_t N>
4145 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4146  Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4147  // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
4148  const RebindToUnsigned<decltype(dbf16)> du16;
4149  const Repartition<uint32_t, decltype(dbf16)> du32;
4150  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4151  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4152 }
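// [Editorial note, not part of the original header] DemoteTo(bf16) above keeps
// only the upper 16 bits of each binary32 (truncation, no rounding): e.g.
// 0x40490FDB (3.14159265f) becomes bf16 bits 0x4049, about 3.140625.
// ReorderDemote2To likewise packs bf16(b) into the even u16 lanes and bf16(a)
// into the odd lanes via OddEven, so the output lane order is permuted rather
// than a plain concatenation, matching the "Reorder" in its name.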
4153 
4154 template <size_t N>
4155 HWY_API Vec128<float, N> DemoteTo(Simd<float, N> /* tag */,
4156  const Vec128<double, N> v) {
4157  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
4158 }
4159 
4160 namespace detail {
4161 
4162 // For well-defined float->int demotion in all x86_*-inl.h.
4163 
4164 template <size_t N>
4165 HWY_INLINE auto ClampF64ToI32Max(Simd<double, N> d, decltype(Zero(d)) v)
4166  -> decltype(Zero(d)) {
4167  // The max can be exactly represented in binary64, so clamping beforehand
4168  // prevents x86 conversion from raising an exception and returning 80..00.
4169  return Min(v, Set(d, 2147483647.0));
4170 }
4171 
4172 // For ConvertTo float->int of same size, clamping before conversion would
4173 // change the result because the max integer value is not exactly representable.
4174 // Instead detect the overflow result after conversion and fix it.
4175 template <typename TI, size_t N, class DF = Simd<MakeFloat<TI>, N>>
4176 HWY_INLINE auto FixConversionOverflow(Simd<TI, N> di,
4177  decltype(Zero(DF())) original,
4178  decltype(Zero(di).raw) converted_raw)
4179  -> decltype(Zero(di)) {
4180  // Combinations of original and output sign:
4181  // --: normal <0 or -huge_val to 80..00: OK
4182  // -+: -0 to 0 : OK
4183  // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
4184  // ++: normal >0 : OK
4185  const auto converted = decltype(Zero(di)){converted_raw};
4186  const auto sign_wrong = AndNot(BitCast(di, original), converted);
4187  return BitCast(di, Xor(converted, BroadcastSignBit(sign_wrong)));
4188 }
4189 
4190 } // namespace detail
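// [Editorial worked example, not part of the original header] x86 cvtt*
// returns the "integer indefinite" value 80..00 for out-of-range inputs.
// FixConversionOverflow above only needs to patch the +huge case: e.g.
// ConvertTo(di, Set(df, 3e9f)) produces 0x80000000, whose sign disagrees with
// the positive input, so sign_wrong has its MSB set and the XOR with FF..FF
// turns the result into 0x7FFFFFFF (LimitsMax). Negative overflow already
// yields 80..00 = LimitsMin and is left unchanged.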
4191 
4192 template <size_t N>
4193 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N> /* tag */,
4194  const Vec128<double, N> v) {
4195  const auto clamped = detail::ClampF64ToI32Max(Simd<double, N>(), v);
4196  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
4197 }
4198 
4199 // For already range-limited input [0, 255].
4200 template <size_t N>
4201 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
4202  const Simd<uint32_t, N> d32;
4203  const Simd<uint8_t, N * 4> d8;
4204  alignas(16) static constexpr uint32_t k8From32[4] = {
4205  0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
4206  // Also replicate bytes into all 32 bit lanes for safety.
4207  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4208  return LowerHalf(LowerHalf(BitCast(d8, quad)));
4209 }
4210 
4211 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4212 
4213 template <size_t N>
4214 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
4215  const Vec128<int32_t, N> v) {
4216  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
4217 }
4218 
4219 template <size_t N>
4220 HWY_API Vec128<double, N> ConvertTo(Simd<double, N> dd,
4221  const Vec128<int64_t, N> v) {
4222 #if HWY_TARGET <= HWY_AVX3
4223  (void)dd;
4224  return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
4225 #else
4226  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4227  const Repartition<uint32_t, decltype(dd)> d32;
4228  const Repartition<uint64_t, decltype(dd)> d64;
4229 
4230  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4231  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4232  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4233 
4234  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4235  const auto k52 = Set(d32, 0x43300000);
4236  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4237 
4238  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4239  return (v_upper - k84_63_52) + v_lower; // order matters!
4240 #endif
4241 }
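// [Editorial derivation, not part of the original header] Why the non-AVX3
// path above is exact: splitting v into hi (signed upper 32 bits) and lo
// (unsigned lower 32 bits), v_upper has the value 2^84 + 2^63 + hi * 2^32 and
// v_lower the value 2^52 + lo, both representable in a binary64 without
// rounding. Hence
//   (v_upper - (2^84 + 2^63 + 2^52)) + v_lower = hi * 2^32 + lo = v,
// and evaluating the subtraction first keeps every intermediate exact.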
4242 
4243 // Truncates (rounds toward zero).
4244 template <size_t N>
4245 HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N> di,
4246  const Vec128<float, N> v) {
4247  return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
4248 }
4249 
4250 // Full (partial handled below)
4251 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
4252 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
4253  return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
4254 #elif HWY_ARCH_X86_64
4255  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
4256  const Half<Full128<double>> dd2;
4257  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
4258  return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
4259 #else
4260  using VI = decltype(Zero(di));
4261  const VI k0 = Zero(di);
4262  const VI k1 = Set(di, 1);
4263  const VI k51 = Set(di, 51);
4264 
4265  // Exponent indicates whether the number can be represented as int64_t.
4266  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4267  const VI exp = biased_exp - Set(di, 0x3FF);
4268  const auto in_range = exp < Set(di, 63);
4269 
4270  // If we were to cap the exponent at 51 and add 2^52, the number would be in
4271  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4272  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4273  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4274  // manually shift the mantissa into place (we already have many of the
4275  // inputs anyway).
4276  const VI shift_mnt = Max(k51 - exp, k0);
4277  const VI shift_int = Max(exp - k51, k0);
4278  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4279  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4280  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4281  // For inputs larger than 2^52, insert zeros at the bottom.
4282  const VI shifted = int52 << shift_int;
4283  // Restore the one bit lost when shifting in the implicit 1-bit.
4284  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4285 
4286  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4287  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4288  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4289  const VI magnitude = IfThenElse(in_range, restored, limit);
4290 
4291  // If the input was negative, negate the integer (two's complement).
4292  return (magnitude ^ sign_mask) - sign_mask;
4293 #endif
4294 }
4295 HWY_API Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> di,
4296  const Vec128<double, 1> v) {
4297  // Only need to specialize for non-AVX3, 64-bit (single scalar op)
4298 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
4299  const Vec128<int64_t, 1> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
4300  return detail::FixConversionOverflow(di, v, i0.raw);
4301 #else
4302  (void)di;
4303  const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
4304  return Vec128<int64_t, 1>{full.raw};
4305 #endif
4306 }
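// [Editorial worked example, not part of the original header] For the generic
// (non-x86-64, non-AVX3) f64 -> i64 path above: v = 3.75 has biased exponent
// 1024, so exp = 1, shift_mnt = 50 and shift_int = 0. The mantissa with its
// implicit 1-bit is 1.875 * 2^52; shifting it right by shift_mnt + 1 = 51
// truncates to 3, i.e. round-toward-zero without touching MXCSR. Inputs with
// exp > 51 instead shift the 52-bit mantissa left and restore the bit lost to
// the extra +1, and out-of-range inputs saturate via the in_range mask.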
4307 
4308 template <size_t N>
4309 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
4310  const Simd<int32_t, N> di;
4311  return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
4312 }
4313 
4314 // ------------------------------ Floating-point rounding (ConvertTo)
4315 
4316 #if HWY_TARGET == HWY_SSSE3
4317 
4318 // Toward nearest integer, ties to even
4319 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4320 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
4321  // Rely on rounding after addition with a large value such that no mantissa
4322  // bits remain (assuming the current mode is nearest-even). We may need a
4323  // compiler flag for precise floating-point to prevent "optimizing" this out.
4324  const Simd<T, N> df;
4325  const auto max = Set(df, MantissaEnd<T>());
4326  const auto large = CopySignToAbs(max, v);
4327  const auto added = large + v;
4328  const auto rounded = added - large;
4329  // Keep original if NaN or the magnitude is large (already an int).
4330  return IfThenElse(Abs(v) < max, rounded, v);
4331 }
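// [Editorial worked example, not part of the original header] For binary32,
// MantissaEnd is 2^23 = 8388608: adding it to inputs with |v| < 2^23 forces the
// FPU to discard every fractional bit (in the current nearest-even mode), and
// subtracting it back recovers the rounded value. E.g. v = 2.5f: 2.5 + 8388608
// rounds to 8388610 (ties to even), and 8388610 - 8388608 = 2.0f. Inputs with
// |v| >= 2^23 are already integers and are returned unchanged.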
4332 
4333 namespace detail {
4334 
4335 // Truncating to integer and converting back to float is correct except when the
4336 // input magnitude is large, in which case the input was already an integer
4337 // (because mantissa >> exponent is zero).
4338 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4339 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
4340  return Abs(v) < Set(Simd<T, N>(), MantissaEnd<T>());
4341 }
4342 
4343 } // namespace detail
4344 
4345 // Toward zero, aka truncate
4346 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4347 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
4348  const Simd<T, N> df;
4349  const RebindToSigned<decltype(df)> di;
4350 
4351  const auto integer = ConvertTo(di, v); // round toward 0
4352  const auto int_f = ConvertTo(df, integer);
4353 
4354  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
4355 }
4356 
4357 // Toward +infinity, aka ceiling
4358 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4359 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
4360  const Simd<T, N> df;
4361  const RebindToSigned<decltype(df)> di;
4362 
4363  const auto integer = ConvertTo(di, v); // round toward 0
4364  const auto int_f = ConvertTo(df, integer);
4365 
4366  // Truncating a positive non-integer ends up smaller; if so, add 1.
4367  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
4368 
4369  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
4370 }
4371 
4372 // Toward -infinity, aka floor
4373 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4374 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
4375  const Simd<T, N> df;
4376  const RebindToSigned<decltype(df)> di;
4377 
4378  const auto integer = ConvertTo(di, v); // round toward 0
4379  const auto int_f = ConvertTo(df, integer);
4380 
4381  // Truncating a negative non-integer ends up larger; if so, subtract 1.
4382  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
4383 
4384  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
4385 }
4386 
4387 #else
4388 
4389 // Toward nearest integer, ties to even
4390 template <size_t N>
4391 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
4392  return Vec128<float, N>{
4393  _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4394 }
4395 template <size_t N>
4396 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
4397  return Vec128<double, N>{
4398  _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4399 }
4400 
4401 // Toward zero, aka truncate
4402 template <size_t N>
4403 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
4404  return Vec128<float, N>{
4405  _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4406 }
4407 template <size_t N>
4408 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
4409  return Vec128<double, N>{
4410  _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4411 }
4412 
4413 // Toward +infinity, aka ceiling
4414 template <size_t N>
4415 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
4416  return Vec128<float, N>{
4417  _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4418 }
4419 template <size_t N>
4420 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
4421  return Vec128<double, N>{
4422  _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4423 }
4424 
4425 // Toward -infinity, aka floor
4426 template <size_t N>
4427 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
4428  return Vec128<float, N>{
4429  _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
4430 }
4431 template <size_t N>
4432 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
4433  return Vec128<double, N>{
4434  _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
4435 }
4436 
4437 #endif // !HWY_SSSE3
4438 
4439 // ================================================== CRYPTO
4440 
4441 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
4442 
4443 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4444 #ifdef HWY_NATIVE_AES
4445 #undef HWY_NATIVE_AES
4446 #else
4447 #define HWY_NATIVE_AES
4448 #endif
4449 
4450 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4451  Vec128<uint8_t> round_key) {
4452  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
4453 }
4454 
4455 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4456 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
4457  Vec128<uint64_t, N> b) {
4458  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
4459 }
4460 
4461 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4462 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
4463  Vec128<uint64_t, N> b) {
4464  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
4465 }
4466 
4467 #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
4468 
4469 // ================================================== MISC
4470 
4471 // Returns a vector with lane i=[0, N) set to "first" + i.
4472 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
4473 HWY_API Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
4474  HWY_ALIGN T lanes[16 / sizeof(T)];
4475  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
4476  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
4477  }
4478  return Load(d, lanes);
4479 }
4480 
4481 #if HWY_TARGET <= HWY_AVX3
4482 
4483 // ------------------------------ LoadMaskBits
4484 
4485 // `p` points to at least 8 readable bytes, not all of which need be valid.
4486 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4487 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> /* tag */,
4488  const uint8_t* HWY_RESTRICT bits) {
4489  uint64_t mask_bits = 0;
4490  constexpr size_t kNumBytes = (N + 7) / 8;
4491  CopyBytes<kNumBytes>(bits, &mask_bits);
4492  if (N < 8) {
4493  mask_bits &= (1ull << N) - 1;
4494  }
4495 
4496  return Mask128<T, N>::FromBits(mask_bits);
4497 }
4498 
4499 // ------------------------------ StoreMaskBits
4500 
4501 // `p` points to at least 8 writable bytes.
4502 template <typename T, size_t N>
4503 HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
4504  const Mask128<T, N> mask, uint8_t* bits) {
4505  constexpr size_t kNumBytes = (N + 7) / 8;
4506  CopyBytes<kNumBytes>(&mask.raw, bits);
4507 
4508  // Non-full byte, need to clear the undefined upper bits.
4509  if (N < 8) {
4510  const int mask = (1 << N) - 1;
4511  bits[0] = static_cast<uint8_t>(bits[0] & mask);
4512  }
4513 
4514  return kNumBytes;
4515 }
4516 
4517 // ------------------------------ Mask testing
4518 
4519 // Beware: the suffix indicates the number of mask bits, not lane size!
4520 
4521 template <typename T, size_t N>
4522 HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4523  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4524  return PopCount(mask_bits);
4525 }
4526 
4527 template <typename T, size_t N>
4528 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
4529  const Mask128<T, N> mask) {
4530  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
4531  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
4532 }
4533 
4534 template <typename T, size_t N>
4535 HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4536  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4537  return mask_bits == 0;
4538 }
4539 
4540 template <typename T, size_t N>
4541 HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4542  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4543  // Cannot use _kortestc because we may have less than 8 mask bits.
4544  return mask_bits == (1u << N) - 1;
4545 }
4546 
4547 // ------------------------------ Compress
4548 
4549 #if HWY_TARGET != HWY_AVX3_DL
4550 namespace detail {
4551 
4552 // Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
4553 HWY_INLINE Vec128<uint16_t, 8> IndicesForCompress16(uint64_t mask_bits) {
4554  Full128<uint16_t> du16;
4555  // Table of u16 indices packed into bytes to reduce L1 usage. Will be unpacked
4556  // to u16. Ideally we would broadcast 8*3 (half of the 8 bytes currently used)
4557  // bits into each lane and then varshift, but that does not fit in 16 bits.
4558  Rebind<uint8_t, decltype(du16)> du8;
4559  alignas(16) constexpr uint8_t tbl[2048] = {
4560  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
4561  1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
4562  0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
4563  0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
4564  0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
4565  0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
4566  0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4567  0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
4568  0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
4569  3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
4570  2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
4571  0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
4572  0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
4573  0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
4574  0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
4575  0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
4576  1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
4577  2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
4578  5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
4579  4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
4580  5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
4581  0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
4582  0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
4583  0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
4584  0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
4585  2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
4586  6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
4587  0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
4588  6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
4589  0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
4590  0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
4591  0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
4592  2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
4593  1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
4594  5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
4595  5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
4596  0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
4597  0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
4598  0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
4599  0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
4600  0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
4601  0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
4602  7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
4603  0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
4604  0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
4605  0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
4606  0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
4607  0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
4608  1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
4609  3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
4610  4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
4611  3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
4612  0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
4613  0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
4614  0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
4615  0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
4616  0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
4617  4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
4618  4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
4619  7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
4620  5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
4621  7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
4622  0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
4623  0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
4624  3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
4625  1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
4626  3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
4627  7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
4628  0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
4629  7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
4630  0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
4631  0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
4632  0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
4633  5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
4634  2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
4635  6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
4636  6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
4637  0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
4638  0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
4639  0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
4640  1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
4641  2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
4642  return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
4643 }
4644 
4645 } // namespace detail
4646 #endif // HWY_TARGET != HWY_AVX3_DL
4647 
4648 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4649 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4650  const Simd<T, N> d;
4651  const Rebind<uint16_t, decltype(d)> du;
4652  const auto vu = BitCast(du, v); // (required for float16_t inputs)
4653 
4654 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
4655  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
4656 #else
4657  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
4658  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
4659 #endif // HWY_TARGET != HWY_AVX3_DL
4660  return BitCast(d, cu);
4661 }
4662 
4663 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4664 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4665  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
4666 }
4667 
4668 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4669 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4670  return Vec128<T, N>{_mm_maskz_compress_epi64(mask.raw, v.raw)};
4671 }
4672 
4673 template <size_t N>
4674 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
4675  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
4676 }
4677 
4678 template <size_t N>
4679 HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
4680  Mask128<double, N> mask) {
4681  return Vec128<double, N>{_mm_maskz_compress_pd(mask.raw, v.raw)};
4682 }
4683 
4684 // ------------------------------ CompressBits (LoadMaskBits)
4685 
4686 template <typename T, size_t N>
4687 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4688  const uint8_t* HWY_RESTRICT bits) {
4689  return Compress(v, LoadMaskBits(Simd<T, N>(), bits));
4690 }
4691 
4692 // ------------------------------ CompressStore
4693 
4694 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4695 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask, Simd<T, N> d,
4696  T* HWY_RESTRICT unaligned) {
4697  const Rebind<uint16_t, decltype(d)> du;
4698  const auto vu = BitCast(du, v); // (required for float16_t inputs)
4699 
4700  const uint64_t mask_bits{mask.raw};
4701 
4702 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
4703  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4704 #else
4705  const auto idx = detail::IndicesForCompress16(mask_bits);
4706  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
4707  StoreU(BitCast(d, cu), d, unaligned);
4708 #endif // HWY_TARGET == HWY_AVX3_DL
4709  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4710 }
4711 
4712 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4713 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
4714  Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
4715  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4716  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4717 }
4718 
4719 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4720 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
4721  Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
4722  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4723  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4724 }
4725 
4726 template <size_t N, HWY_IF_LE128(float, N)>
4727 HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
4728  Simd<float, N> /* tag */,
4729  float* HWY_RESTRICT unaligned) {
4730  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4731  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4732 }
4733 
4734 template <size_t N, HWY_IF_LE128(double, N)>
4735 HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
4736  Simd<double, N> /* tag */,
4737  double* HWY_RESTRICT unaligned) {
4738  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4739  return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4740 }
4741 
4742 // ------------------------------ CompressBitsStore (LoadMaskBits)
4743 
4744 template <typename T, size_t N>
4745 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4746  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
4747  T* HWY_RESTRICT unaligned) {
4748  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4749 }
4750 
4751 #else // AVX2 or below
4752 
4753 // ------------------------------ LoadMaskBits (TestBit)
4754 
4755 namespace detail {
4756 
4757 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4758 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4759  const RebindToUnsigned<decltype(d)> du;
4760  // Easier than Set(), which would require an >8-bit type, which would not
4761  // compile for T=uint8_t, N=1.
4762  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
4763 
4764  // Replicate bytes 8x such that each byte contains the bit that governs it.
4765  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4766  1, 1, 1, 1, 1, 1, 1, 1};
4767  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
4768 
4769  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4770  1, 2, 4, 8, 16, 32, 64, 128};
4771  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4772 }
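// [Editorial worked example, not part of the original header] For the u8 case
// above with mask_bits = 0b00000101: the low byte 0x05 is broadcast to lanes
// 0..7 (and the second byte to lanes 8..15) by the kRep8 shuffle, then TestBit
// against {1, 2, 4, 8, ...} leaves exactly lanes 0 and 2 active. Each lane
// therefore tests the single bit of mask_bits that governs it.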
4773 
4774 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4775 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4776  const RebindToUnsigned<decltype(d)> du;
4777  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4778  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4779  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4780 }
4781 
4782 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4783 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4784  const RebindToUnsigned<decltype(d)> du;
4785  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4786  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4787  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4788 }
4789 
4790 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4791 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4792  const RebindToUnsigned<decltype(d)> du;
4793  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4794  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4795 }
4796 
4797 } // namespace detail
4798 
4799 // `p` points to at least 8 readable bytes, not all of which need be valid.
4800 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4801 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
4802  const uint8_t* HWY_RESTRICT bits) {
4803  uint64_t mask_bits = 0;
4804  constexpr size_t kNumBytes = (N + 7) / 8;
4805  CopyBytes<kNumBytes>(bits, &mask_bits);
4806  if (N < 8) {
4807  mask_bits &= (1ull << N) - 1;
4808  }
4809 
4810  return detail::LoadMaskBits(d, mask_bits);
4811 }
4812 
4813 // ------------------------------ StoreMaskBits
4814 
4815 namespace detail {
4816 
4817 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
4818  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
4819 }
4820 
4821 template <typename T, size_t N>
4822 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4823  const Mask128<T, N> mask) {
4824  const Simd<T, N> d;
4825  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
4826  return U64FromInt(_mm_movemask_epi8(sign_bits));
4827 }
4828 
4829 template <typename T, size_t N>
4830 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4831  const Mask128<T, N> mask) {
4832  // Remove useless lower half of each u16 while preserving the sign bit.
4833  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
4834  return U64FromInt(_mm_movemask_epi8(sign_bits));
4835 }
4836 
4837 template <typename T, size_t N>
4838 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4839  const Mask128<T, N> mask) {
4840  const Simd<T, N> d;
4841  const Simd<float, N> df;
4842  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
4843  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
4844 }
4845 
4846 template <typename T, size_t N>
4847 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
4848  const Mask128<T, N> mask) {
4849  const Simd<T, N> d;
4850  const Simd<double, N> df;
4851  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
4852  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
4853 }
4854 
4855  // Returns the lowest N bits of the _mm_movemask* result.
4856 template <typename T, size_t N>
4857 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
4858  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
4859 }
4860 
4861 template <typename T, size_t N>
4862 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
4863  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
4864 }
4865 
4866 } // namespace detail
4867 
4868  // `bits` points to at least 8 writable bytes.
4869 template <typename T, size_t N>
4870 HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
4871  const Mask128<T, N> mask, uint8_t* bits) {
4872  constexpr size_t kNumBytes = (N + 7) / 8;
4873  const uint64_t mask_bits = detail::BitsFromMask(mask);
4874  CopyBytes<kNumBytes>(&mask_bits, bits);
4875  return kNumBytes;
4876 }
4877 
4878 // ------------------------------ Mask testing
4879 
4880 template <typename T, size_t N>
4881 HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4882  // Cheaper than PTEST, which is 2 uops with 3-cycle latency.
4883  return detail::BitsFromMask(mask) == 0;
4884 }
4885 
4886 template <typename T, size_t N>
4887 HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4888  constexpr uint64_t kAllBits =
4889  detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
4890  return detail::BitsFromMask(mask) == kAllBits;
4891 }
4892 
4893 template <typename T, size_t N>
4894 HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4895  return PopCount(detail::BitsFromMask(mask));
4896 }
4897 
4898 template <typename T, size_t N>
4899 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
4900  const Mask128<T, N> mask) {
4901  const uint64_t mask_bits = detail::BitsFromMask(mask);
4902  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
4903 }
4904 
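// Usage sketch (illustrative; the function name and values are hypothetical):
// round-tripping a mask through its packed-bit form.
HWY_MAYBE_UNUSED static size_t ExampleMaskBitsRoundTrip() {
  const Full128<uint16_t> d;                   // 8 x 16-bit lanes
  const auto m = FirstN(d, 3);                 // lanes 0..2 active
  uint8_t bits[8] = {0};
  (void)StoreMaskBits(d, m, bits);             // writes 1 byte; bits[0] == 0x07
  return CountTrue(d, LoadMaskBits(d, bits));  // 3
}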
4905 // ------------------------------ Compress, CompressBits
4906 
4907 namespace detail {
4908 
4909 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4910 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
4911  HWY_DASSERT(mask_bits < 256);
4912  const Rebind<uint8_t, decltype(d)> d8;
4913  const Simd<uint16_t, N> du;
4914 
4915  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
4916  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
4917  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
4918  // store lane indices and convert to byte indices (2*lane + 0..1), with the
4919  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
4920  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
4921  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
4922  // is likely more costly than the higher cache footprint from storing bytes.
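  // Worked example (illustrative): mask_bits = 0b00000110 (lanes 1 and 2)
  // selects the table row {2, 4, 0, ...}. ZipLower of that row with itself
  // yields the u16 lanes 0x0202, 0x0404, ..., and adding 0x0100 produces the
  // byte-pair indices 0x0302, 0x0504, ..., with which TableLookupBytes gathers
  // lanes 1 and 2 into the first two output lanes.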
4923  alignas(16) constexpr uint8_t table[2048] = {
4924  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
4925  0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
4926  0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4927  0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
4928  0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
4929  6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
4930  0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
4931  0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
4932  2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
4933  0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
4934  0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
4935  0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
4936  0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
4937  6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
4938  8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
4939  0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4940  4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
4941  10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
4942  0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
4943  0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
4944  0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4945  4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
4946  0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
4947  0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
4948  2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
4949  10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
4950  0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
4951  0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
4952  0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
4953  0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
4954  0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
4955  0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
4956  6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
4957  12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
4958  0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
4959  0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
4960  0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
4961  8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
4962  0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
4963  0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
4964  2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
4965  8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
4966  12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
4967  0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
4968  0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
4969  10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
4970  12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
4971  0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4972  4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
4973  6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
4974  0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
4975  0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
4976  0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4977  4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
4978  12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
4979  0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
4980  2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
4981  0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
4982  0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
4983  0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
4984  0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
4985  14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
4986  0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
4987  0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
4988  8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
4989  14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
4990  0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
4991  0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
4992  0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
4993  6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
4994  14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
4995  0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
4996  2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
4997  14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
4998  0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
4999  0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
5000  0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
5001  6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
5002  10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
5003  0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
5004  4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
5005  8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
5006  0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
5007  0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
5008  0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
5009  4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
5010  0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
5011  0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
5012  2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
5013  14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
5014  0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
5015  0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
5016  0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
5017  12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
5018  14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
5019  0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
5020  6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
5021  8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
5022  14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
5023  0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
5024  0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
5025  10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
5026  14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
5027  0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
5028  2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
5029  10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
5030  12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
5031  0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
5032  0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
5033  8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
5034  10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
5035  0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
5036  4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5037  6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5038 
5039  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5040  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5041  return BitCast(d, pairs + Set(du, 0x0100));
5042 }
5043 
5044 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5045 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5046  HWY_DASSERT(mask_bits < 16);
5047 
5048  // There are only 4 lanes, so we can afford to load the index vector directly.
5049  alignas(16) constexpr uint8_t packed_array[256] = {
5050  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
5051  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
5052  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
5053  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
5054  8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
5055  0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
5056  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
5057  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
5058  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
5059  0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
5060  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
5061  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
5062  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
5063  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5064  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5065  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5066 
5067  const Repartition<uint8_t, decltype(d)> d8;
5068  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5069 }
5070 
5071 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5072 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5073  HWY_DASSERT(mask_bits < 4);
5074 
5075  // There are only 2 lanes, so we can afford to load the index vector directly.
5076  alignas(16) constexpr uint8_t packed_array[64] = {
5077  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5078  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5079  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5080  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5081 
5082  const Repartition<uint8_t, decltype(d)> d8;
5083  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5084 }
5085 
5086 } // namespace detail
5087 
5088 template <typename T, size_t N>
5089 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
5090  const Simd<T, N> d;
5091  const RebindToUnsigned<decltype(d)> du;
5092 
5093  const uint64_t mask_bits = detail::BitsFromMask(m);
5094  HWY_DASSERT(mask_bits < (1ull << N));
5095 
5096  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5097  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5098 }
5099 
5100 template <typename T, size_t N>
5101 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5102  const uint8_t* HWY_RESTRICT bits) {
5103  const Simd<T, N> d;
5104  const RebindToUnsigned<decltype(d)> du;
5105 
5106  uint64_t mask_bits = 0;
5107  constexpr size_t kNumBytes = (N + 7) / 8;
5108  CopyBytes<kNumBytes>(bits, &mask_bits);
5109  if (N < 8) {
5110  mask_bits &= (1ull << N) - 1;
5111  }
5112 
5113  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5114  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5115 }
5116 
5117 // ------------------------------ CompressStore, CompressBitsStore
5118 
5119 template <typename T, size_t N>
5120 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N> d,
5121  T* HWY_RESTRICT unaligned) {
5122  const RebindToUnsigned<decltype(d)> du;
5123 
5124  const uint64_t mask_bits = detail::BitsFromMask(m);
5125  HWY_DASSERT(mask_bits < (1ull << N));
5126 
5127  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5128  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5129  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5130  StoreU(compressed, d, unaligned);
5131  return PopCount(mask_bits);
5132 }
5133 
5134 template <typename T, size_t N>
5135 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5136  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
5137  T* HWY_RESTRICT unaligned) {
5138  const RebindToUnsigned<decltype(d)> du;
5139 
5140  uint64_t mask_bits = 0;
5141  constexpr size_t kNumBytes = (N + 7) / 8;
5142  CopyBytes<kNumBytes>(bits, &mask_bits);
5143  if (N < 8) {
5144  mask_bits &= (1ull << N) - 1;
5145  }
5146 
5147  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5148  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5149  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5150  StoreU(compressed, d, unaligned);
5151  return PopCount(mask_bits);
5152 }
5153 
5154 #endif // HWY_TARGET <= HWY_AVX3
5155 
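// Usage sketch (illustrative; the function name and `out` are hypothetical).
// Depending on the target, up to a full vector may be written, so `out` must
// have room for all 4 lanes; only the first (returned) count is meaningful.
HWY_MAYBE_UNUSED static size_t ExampleCompressStore(int32_t* HWY_RESTRICT out) {
  const Full128<int32_t> d;
  const auto v = Iota(d, 1);                      // 1, 2, 3, 4
  return CompressStore(v, FirstN(d, 2), d, out);  // 2; out[0] = 1, out[1] = 2
}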
5156 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
5157 // TableLookupBytes)
5158 
5159 // 128 bits
5160 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
5161  const Vec128<uint8_t> v1,
5162  const Vec128<uint8_t> v2, Full128<uint8_t> d,
5163  uint8_t* HWY_RESTRICT unaligned) {
5164  const auto k5 = Set(d, 5);
5165  const auto k6 = Set(d, 6);
5166 
5167  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5168  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5169  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5170  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
5171  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5172  alignas(16) static constexpr uint8_t tbl_g0[16] = {
5173  0x80, 0, 0x80, 0x80, 1, 0x80, //
5174  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5175  const auto shuf_r0 = Load(d, tbl_r0);
5176  const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
5177  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
5178  const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
5179  const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
5180  const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
5181  const auto int0 = r0 | g0 | b0;
5182  StoreU(int0, d, unaligned + 0 * 16);
5183 
5184  // Second vector: g10,r10, bgr[9:6], b5,g5
5185  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
5186  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
5187  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
5188  const auto r1 = TableLookupBytes(v0, shuf_r1);
5189  const auto g1 = TableLookupBytes(v1, shuf_g1);
5190  const auto b1 = TableLookupBytes(v2, shuf_b1);
5191  const auto int1 = r1 | g1 | b1;
5192  StoreU(int1, d, unaligned + 1 * 16);
5193 
5194  // Third vector: bgr[15:11], b10
5195  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
5196  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
5197  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
5198  const auto r2 = TableLookupBytes(v0, shuf_r2);
5199  const auto g2 = TableLookupBytes(v1, shuf_g2);
5200  const auto b2 = TableLookupBytes(v2, shuf_b2);
5201  const auto int2 = r2 | g2 | b2;
5202  StoreU(int2, d, unaligned + 2 * 16);
5203 }
5204 
5205 // 64 bits
5206 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
5207  const Vec128<uint8_t, 8> v1,
5208  const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
5209  uint8_t* HWY_RESTRICT unaligned) {
5210  // Use full vectors for the shuffles and first result.
5211  const Full128<uint8_t> d_full;
5212  const auto k5 = Set(d_full, 5);
5213  const auto k6 = Set(d_full, 6);
5214 
5215  const Vec128<uint8_t> full_a{v0.raw};
5216  const Vec128<uint8_t> full_b{v1.raw};
5217  const Vec128<uint8_t> full_c{v2.raw};
5218 
5219  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5220  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5221  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5222  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
5223  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5224  alignas(16) static constexpr uint8_t tbl_g0[16] = {
5225  0x80, 0, 0x80, 0x80, 1, 0x80, //
5226  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5227  const auto shuf_r0 = Load(d_full, tbl_r0);
5228  const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
5229  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
5230  const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
5231  const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
5232  const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
5233  const auto int0 = r0 | g0 | b0;
5234  StoreU(int0, d_full, unaligned + 0 * 16);
5235 
5236  // Second (HALF) vector: bgr[7:6], b5,g5
5237  const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
5238  const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
5239  const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
5240  const auto r1 = TableLookupBytes(full_a, shuf_r1);
5241  const auto g1 = TableLookupBytes(full_b, shuf_g1);
5242  const auto b1 = TableLookupBytes(full_c, shuf_b1);
5243  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
5244  StoreU(int1, d, unaligned + 1 * 16);
5245 }
5246 
5247 // <= 32 bits
5248 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5249 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
5250  const Vec128<uint8_t, N> v1,
5251  const Vec128<uint8_t, N> v2,
5252  Simd<uint8_t, N> /*tag*/,
5253  uint8_t* HWY_RESTRICT unaligned) {
5254  // Use full vectors for the shuffles and result.
5255  const Full128<uint8_t> d_full;
5256 
5257  const Vec128<uint8_t> full_a{v0.raw};
5258  const Vec128<uint8_t> full_b{v1.raw};
5259  const Vec128<uint8_t> full_c{v2.raw};
5260 
5261  // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
5262  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5263  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5264  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
5265  0x80, 0x80, 0x80, 0x80};
5266  const auto shuf_r0 = Load(d_full, tbl_r0);
5267  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
5268  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
5269  const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
5270  const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
5271  const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
5272  const auto int0 = r0 | g0 | b0;
5273  alignas(16) uint8_t buf[16];
5274  StoreU(int0, d_full, buf);
5275  CopyBytes<N * 3>(buf, unaligned);
5276 }
5277 
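// Usage sketch (illustrative; names are hypothetical): interleaving three
// 16-byte planes (e.g. R, G, B) into 48 bytes of RGBRGB... output.
HWY_MAYBE_UNUSED static void ExampleStoreInterleaved3(
    const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
    const uint8_t* HWY_RESTRICT b, uint8_t* HWY_RESTRICT rgb) {
  const Full128<uint8_t> d;
  StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
}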
5278 // ------------------------------ StoreInterleaved4
5279 
5280 // 128 bits
5281 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
5282  const Vec128<uint8_t> v1,
5283  const Vec128<uint8_t> v2,
5284  const Vec128<uint8_t> v3, Full128<uint8_t> d8,
5285  uint8_t* HWY_RESTRICT unaligned) {
5286  const RepartitionToWide<decltype(d8)> d16;
5287  const RepartitionToWide<decltype(d16)> d32;
5288  // let a,b,c,d denote v0..3.
5289  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
5290  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
5291  const auto ba8 = ZipUpper(d16, v0, v1);
5292  const auto dc8 = ZipUpper(d16, v2, v3);
5293  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
5294  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
5295  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
5296  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
5297  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
5298  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
5299  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
5300  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
5301 }
5302 
5303 // 64 bits
5304 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
5305  const Vec128<uint8_t, 8> in1,
5306  const Vec128<uint8_t, 8> in2,
5307  const Vec128<uint8_t, 8> in3,
5308  Simd<uint8_t, 8> /*tag*/,
5309  uint8_t* HWY_RESTRICT unaligned) {
5310  // Use full vectors to reduce the number of stores.
5311  const Full128<uint8_t> d_full8;
5312  const RepartitionToWide<decltype(d_full8)> d16;
5313  const RepartitionToWide<decltype(d16)> d32;
5314  const Vec128<uint8_t> v0{in0.raw};
5315  const Vec128<uint8_t> v1{in1.raw};
5316  const Vec128<uint8_t> v2{in2.raw};
5317  const Vec128<uint8_t> v3{in3.raw};
5318  // let a,b,c,d denote v0..3.
5319  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
5320  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
5321  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
5322  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
5323  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
5324  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
5325 }
5326 
5327 // <= 32 bits
5328 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5329 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
5330  const Vec128<uint8_t, N> in1,
5331  const Vec128<uint8_t, N> in2,
5332  const Vec128<uint8_t, N> in3,
5333  Simd<uint8_t, N> /*tag*/,
5334  uint8_t* HWY_RESTRICT unaligned) {
5335  // Use full vectors to reduce the number of stores.
5336  const Full128<uint8_t> d_full8;
5337  const RepartitionToWide<decltype(d_full8)> d16;
5338  const RepartitionToWide<decltype(d16)> d32;
5339  const Vec128<uint8_t> v0{in0.raw};
5340  const Vec128<uint8_t> v1{in1.raw};
5341  const Vec128<uint8_t> v2{in2.raw};
5342  const Vec128<uint8_t> v3{in3.raw};
5343  // let a,b,c,d denote v0..3.
5344  const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
5345  const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
5346  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
5347  alignas(16) uint8_t buf[16];
5348  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
5349  CopyBytes<4 * N>(buf, unaligned);
5350 }
5351 
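// Usage sketch (illustrative; names are hypothetical): interleaving four
// 16-byte planes (e.g. R, G, B, A) into 64 bytes of RGBARGBA... output.
HWY_MAYBE_UNUSED static void ExampleStoreInterleaved4(
    const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
    const uint8_t* HWY_RESTRICT b, const uint8_t* HWY_RESTRICT a,
    uint8_t* HWY_RESTRICT rgba) {
  const Full128<uint8_t> d;
  StoreInterleaved4(LoadU(d, r), LoadU(d, g), LoadU(d, b), LoadU(d, a), d,
                    rgba);
}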
5352 // ------------------------------ Reductions
5353 
5354 namespace detail {
5355 
5356 // N=1 for any T: no-op
5357 template <typename T>
5358 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5359  const Vec128<T, 1> v) {
5360  return v;
5361 }
5362 template <typename T>
5363 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5364  const Vec128<T, 1> v) {
5365  return v;
5366 }
5367 template <typename T>
5368 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5369  const Vec128<T, 1> v) {
5370  return v;
5371 }
5372 
5373 // u32/i32/f32:
5374 
5375 // N=2
5376 template <typename T>
5377 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
5378  const Vec128<T, 2> v10) {
5379  return v10 + Shuffle2301(v10);
5380 }
5381 template <typename T>
5382 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
5383  const Vec128<T, 2> v10) {
5384  return Min(v10, Shuffle2301(v10));
5385 }
5386 template <typename T>
5387 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5388  const Vec128<T, 2> v10) {
5389  return Max(v10, Shuffle2301(v10));
5390 }
5391 
5392 // N=4 (full)
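// The full-vector sum below uses two shuffle+add steps: adding the 64-bit
// swapped copy leaves the partial sums {3+1, 2+0} replicated in both halves,
// and adding a one-lane rotation of that result broadcasts the total to all
// four lanes. MinOfLanes/MaxOfLanes follow the same pattern with Min/Max.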
5393 template <typename T>
5394 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
5395  const Vec128<T> v3210) {
5396  const Vec128<T> v1032 = Shuffle1032(v3210);
5397  const Vec128<T> v31_20_31_20 = v3210 + v1032;
5398  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5399  return v20_31_20_31 + v31_20_31_20;
5400 }
5401 template <typename T>
5402 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
5403  const Vec128<T> v3210) {
5404  const Vec128<T> v1032 = Shuffle1032(v3210);
5405  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5406  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5407  return Min(v20_31_20_31, v31_20_31_20);
5408 }
5409 template <typename T>
5410 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5411  const Vec128<T> v3210) {
5412  const Vec128<T> v1032 = Shuffle1032(v3210);
5413  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5414  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5415  return Max(v20_31_20_31, v31_20_31_20);
5416 }
5417 
5418 // u64/i64/f64:
5419 
5420 // N=2 (full)
5421 template <typename T>
5422 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
5423  const Vec128<T> v10) {
5424  const Vec128<T> v01 = Shuffle01(v10);
5425  return v10 + v01;
5426 }
5427 template <typename T>
5428 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
5429  const Vec128<T> v10) {
5430  const Vec128<T> v01 = Shuffle01(v10);
5431  return Min(v10, v01);
5432 }
5433 template <typename T>
5434 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
5435  const Vec128<T> v10) {
5436  const Vec128<T> v01 = Shuffle01(v10);
5437  return Max(v10, v01);
5438 }
5439 
5440 } // namespace detail
5441 
5442 // Supported for u/i/f 32/64. Returns the same value in each lane.
5443 template <typename T, size_t N>
5444 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5445  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5446 }
5447 template <typename T, size_t N>
5448 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5449  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5450 }
5451 template <typename T, size_t N>
5452 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5453  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5454 }
5455 
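// Usage sketch (illustrative; the function name is hypothetical): all lanes of
// the reduction result are identical, so GetLane extracts the scalar value.
HWY_MAYBE_UNUSED static float ExampleSumOfLanes() {
  const Full128<float> d;
  const auto v = Iota(d, 1.0f);      // 1, 2, 3, 4
  return GetLane(SumOfLanes(d, v));  // 10
}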
5456 // ================================================== DEPRECATED
5457 
5458 template <typename T, size_t N>
5459 HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
5460  return StoreMaskBits(Simd<T, N>(), mask, bits);
5461 }
5462 
5463 template <typename T, size_t N>
5464 HWY_API bool AllTrue(const Mask128<T, N> mask) {
5465  return AllTrue(Simd<T, N>(), mask);
5466 }
5467 
5468 template <typename T, size_t N>
5469 HWY_API bool AllFalse(const Mask128<T, N> mask) {
5470  return AllFalse(Simd<T, N>(), mask);
5471 }
5472 
5473 template <typename T, size_t N>
5474 HWY_API size_t CountTrue(const Mask128<T, N> mask) {
5475  return CountTrue(Simd<T, N>(), mask);
5476 }
5477 
5478 template <typename T, size_t N>
5479 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
5480  return SumOfLanes(Simd<T, N>(), v);
5481 }
5482 template <typename T, size_t N>
5483 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
5484  return MinOfLanes(Simd<T, N>(), v);
5485 }
5486 template <typename T, size_t N>
5487 HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
5488  return MaxOfLanes(Simd<T, N>(), v);
5489 }
5490 
5491 template <typename T, size_t N>
5492 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
5493  return UpperHalf(Half<Simd<T, N>>(), v);
5494 }
5495 
5496 template <int kBytes, typename T, size_t N>
5497 HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
5498  return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
5499 }
5500 
5501 template <int kLanes, typename T, size_t N>
5502 HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
5503  return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
5504 }
5505 
5506 template <size_t kBytes, typename T, size_t N>
5507 HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
5508  return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
5509 }
5510 
5511 template <typename T, size_t N>
5512 HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
5513  return InterleaveUpper(Simd<T, N>(), a, b);
5514 }
5515 
5516 template <typename T, size_t N, class D = Simd<T, N>>
5517 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
5518  return InterleaveUpper(RepartitionToWide<D>(), a, b);
5519 }
5520 
5521 template <typename T, size_t N2>
5522 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
5523  return Combine(Simd<T, N2 * 2>(), hi2, lo2);
5524 }
5525 
5526 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
5527 HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
5528  return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
5529 }
5530 
5531 template <typename T, size_t N>
5532 HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
5533  return ConcatLowerLower(Simd<T, N>(), hi, lo);
5534 }
5535 
5536 template <typename T, size_t N>
5537 HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
5538  return ConcatUpperUpper(Simd<T, N>(), hi, lo);
5539 }
5540 
5541 template <typename T, size_t N>
5542 HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
5543  const Vec128<T, N> lo) {
5544  return ConcatLowerUpper(Simd<T, N>(), hi, lo);
5545 }
5546 
5547 template <typename T, size_t N>
5548 HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
5549  return ConcatUpperLower(Simd<T, N>(), hi, lo);
5550 }
5551 
5552 // ================================================== Operator wrapper
5553 
5554 // These apply to all x86_*-inl.h because there are no restrictions on V.
5555 
5556 template <class V>
5557 HWY_API V Add(V a, V b) {
5558  return a + b;
5559 }
5560 template <class V>
5561 HWY_API V Sub(V a, V b) {
5562  return a - b;
5563 }
5564 
5565 template <class V>
5566 HWY_API V Mul(V a, V b) {
5567  return a * b;
5568 }
5569 template <class V>
5570 HWY_API V Div(V a, V b) {
5571  return a / b;
5572 }
5573 
5574 template <class V>
5575 V Shl(V a, V b) {
5576  return a << b;
5577 }
5578 template <class V>
5579 V Shr(V a, V b) {
5580  return a >> b;
5581 }
5582 
5583 template <class V>
5584 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
5585  return a == b;
5586 }
5587 template <class V>
5588 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
5589  return a != b;
5590 }
5591 template <class V>
5592 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
5593  return a < b;
5594 }
5595 
5596 template <class V>
5597 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
5598  return a > b;
5599 }
5600 template <class V>
5601 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
5602  return a >= b;
5603 }
5604 
5605 template <class V>
5606 HWY_API auto Le(V a, V b) -> decltype(a == b) {
5607  return a <= b;
5608 }
5609 
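// Usage sketch (illustrative; the function name is hypothetical): the named
// wrappers let generic code avoid relying on operator overloads.
HWY_MAYBE_UNUSED static size_t ExampleWrappers(Vec128<int32_t> a,
                                               Vec128<int32_t> b) {
  const Full128<int32_t> d;
  return CountTrue(d, Eq(Add(a, b), Add(b, a)));  // 4: addition commutes
}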
5610 // NOLINTNEXTLINE(google-readability-namespace-comments)
5611 } // namespace HWY_NAMESPACE
5612 } // namespace hwy