x86_512-inl.h
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 512-bit AVX512 vectors and operations.
16 // External include guard in highway.h - see comment there.
17 
18 // WARNING: most operations do not cross 128-bit block boundaries. In
19 // particular, "Broadcast", pack and zip behavior may be surprising.
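// For example (illustrative note, not part of the original header): with
// sixteen int32_t lanes, Broadcast<0>(v) replicates lane 0 of *each* 128-bit
// block, yielding lane indices {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
// rather than lane 0 of the entire vector.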
20 
21 #include <immintrin.h> // AVX2+
22 
23 #include "hwy/base.h"
24 
25 #if defined(_MSC_VER) && defined(__clang__)
26 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
27 // including these headers when _MSC_VER is defined, like when using clang-cl.
28 // Include these directly here.
29 // clang-format off
30 #include <smmintrin.h>
31 
32 #include <avxintrin.h>
33 #include <avx2intrin.h>
34 #include <f16cintrin.h>
35 #include <fmaintrin.h>
36 
37 #include <avx512fintrin.h>
38 #include <avx512vlintrin.h>
39 #include <avx512bwintrin.h>
40 #include <avx512dqintrin.h>
41 #include <avx512vlbwintrin.h>
42 #include <avx512vldqintrin.h>
43 #include <avx512bitalgintrin.h>
44 #include <avx512vlbitalgintrin.h>
45 #include <avx512vpopcntdqintrin.h>
46 #include <avx512vpopcntdqvlintrin.h>
47 // clang-format on
48 #endif
49 
50 #include <stddef.h>
51 #include <stdint.h>
52 
53 // For half-width vectors. Already includes base.h and shared-inl.h.
54 #include "hwy/ops/x86_256-inl.h"
55 
56 HWY_BEFORE_NAMESPACE();
57 namespace hwy {
58 namespace HWY_NAMESPACE {
59 
60 template <typename T>
61 using Full512 = Simd<T, 64 / sizeof(T)>;
62 
63 namespace detail {
64 
65 template <typename T>
66 struct Raw512 {
67  using type = __m512i;
68 };
69 template <>
70 struct Raw512<float> {
71  using type = __m512;
72 };
73 template <>
74 struct Raw512<double> {
75  using type = __m512d;
76 };
77 
78 // Template arg: sizeof(lane type)
79 template <size_t size>
80 struct RawMask512 {};
81 template <>
82 struct RawMask512<1> {
83  using type = __mmask64;
84 };
85 template <>
86 struct RawMask512<2> {
87  using type = __mmask32;
88 };
89 template <>
90 struct RawMask512<4> {
91  using type = __mmask16;
92 };
93 template <>
94 struct RawMask512<8> {
95  using type = __mmask8;
96 };
97 
98 } // namespace detail
99 
100 template <typename T>
101 class Vec512 {
102  using Raw = typename detail::Raw512<T>::type;
103 
104  public:
105  // Compound assignment. Only usable if there is a corresponding non-member
106  // binary operator overload. For example, only f32 and f64 support division.
107  HWY_INLINE Vec512& operator*=(const Vec512 other) {
108  return *this = (*this * other);
109  }
110  HWY_INLINE Vec512& operator/=(const Vec512 other) {
111  return *this = (*this / other);
112  }
113  HWY_INLINE Vec512& operator+=(const Vec512 other) {
114  return *this = (*this + other);
115  }
116  HWY_INLINE Vec512& operator-=(const Vec512 other) {
117  return *this = (*this - other);
118  }
119  HWY_INLINE Vec512& operator&=(const Vec512 other) {
120  return *this = (*this & other);
121  }
122  HWY_INLINE Vec512& operator|=(const Vec512 other) {
123  return *this = (*this | other);
124  }
125  HWY_INLINE Vec512& operator^=(const Vec512 other) {
126  return *this = (*this ^ other);
127  }
128 
129  Raw raw;
130 };
131 
132 // Mask register: one bit per lane.
133 template <typename T>
134 struct Mask512 {
135  typename detail::RawMask512<sizeof(T)>::type raw;
136 };
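// Illustrative note (not in the original header): for T = float,
// sizeof(T) == 4, so Mask512<float>::raw is a __mmask16 in which bit i
// corresponds to lane i; e.g. raw == 0x0003 selects lanes 0 and 1.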
137 
138 // ------------------------------ BitCast
139 
140 namespace detail {
141 
142 HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
143 HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
144 HWY_INLINE __m512i BitCastToInteger(__m512d v) {
145  return _mm512_castpd_si512(v);
146 }
147 
148 template <typename T>
149 HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
150  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
151 }
152 
153 // Cannot rely on function overloading because return types differ.
154 template <typename T>
155 struct BitCastFromInteger512 {
156  HWY_INLINE __m512i operator()(__m512i v) { return v; }
157 };
158 template <>
159 struct BitCastFromInteger512<float> {
160  HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
161 };
162 template <>
163 struct BitCastFromInteger512<double> {
164  HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
165 };
166 
167 template <typename T>
168 HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
169  return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
170 }
171 
172 } // namespace detail
173 
174 template <typename T, typename FromT>
175 HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
176  return BitCastFromByte(d, BitCastToByte(v));
177 }
178 
179 // ------------------------------ Set
180 
181 // Returns an all-zero vector.
182 template <typename T>
183 HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
184  return Vec512<T>{_mm512_setzero_si512()};
185 }
186 HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
187  return Vec512<float>{_mm512_setzero_ps()};
188 }
189 HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
190  return Vec512<double>{_mm512_setzero_pd()};
191 }
192 
193 // Returns a vector with all lanes set to "t".
194 HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
195  return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
196 }
197 HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
198  return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
199 }
200 HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
201  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
202 }
203 HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
204  return Vec512<uint64_t>{
205  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
206 }
207 HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
208  return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
209 }
210 HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
211  return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
212 }
213 HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
214  return Vec512<int32_t>{_mm512_set1_epi32(t)};
215 }
216 HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
217  return Vec512<int64_t>{
218  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
219 }
220 HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
221  return Vec512<float>{_mm512_set1_ps(t)};
222 }
223 HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
224  return Vec512<double>{_mm512_set1_pd(t)};
225 }
226 
227 HWY_DIAGNOSTICS(push)
228 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
229 
230 // Returns a vector with uninitialized elements.
231 template <typename T>
232 HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
233  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
234  // generate an XOR instruction.
235  return Vec512<T>{_mm512_undefined_epi32()};
236 }
237 HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
238  return Vec512<float>{_mm512_undefined_ps()};
239 }
240 HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
241  return Vec512<double>{_mm512_undefined_pd()};
242 }
243 
244 HWY_DIAGNOSTICS(pop)
245 
246 // ================================================== LOGICAL
247 
248 // ------------------------------ Not
249 
250 template <typename T>
251 HWY_API Vec512<T> Not(const Vec512<T> v) {
252  using TU = MakeUnsigned<T>;
253  const __m512i vu = BitCast(Full512<TU>(), v).raw;
254  return BitCast(Full512<T>(),
255  Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
256 }
257 
258 // ------------------------------ And
259 
260 template <typename T>
262  return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
263 }
264 
266  return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
267 }
269  return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
270 }
271 
272 // ------------------------------ AndNot
273 
274 // Returns ~not_mask & mask.
275 template <typename T>
276 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
277  return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
278 }
280  const Vec512<float> mask) {
281  return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
282 }
284  const Vec512<double> mask) {
285  return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
286 }
287 
288 // ------------------------------ Or
289 
290 template <typename T>
292  return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
293 }
294 
296  return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
297 }
299  return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
300 }
301 
302 // ------------------------------ Xor
303 
304 template <typename T>
306  return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
307 }
308 
310  return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
311 }
313  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
314 }
315 
316 // ------------------------------ Operator overloads (internal-only if float)
317 
318 template <typename T>
320  return And(a, b);
321 }
322 
323 template <typename T>
325  return Or(a, b);
326 }
327 
328 template <typename T>
330  return Xor(a, b);
331 }
332 
333 // ------------------------------ PopulationCount
334 
335 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
336 #if HWY_TARGET == HWY_AVX3_DL
337 
338 #ifdef HWY_NATIVE_POPCNT
339 #undef HWY_NATIVE_POPCNT
340 #else
341 #define HWY_NATIVE_POPCNT
342 #endif
343 
344 namespace detail {
345 
346 template <typename T>
348  return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
349 }
350 template <typename T>
352  return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
353 }
354 template <typename T>
356  return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
357 }
358 template <typename T>
360  return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
361 }
362 
363 } // namespace detail
364 
365 template <typename T>
367  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
368 }
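// Usage sketch (illustrative; assumes a Full512<uint8_t> descriptor `d`):
//   const auto v = Set(d, uint8_t{0xF0});
//   const auto bits = PopulationCount(v);  // every lane is 4
// Only available when compiling for HWY_AVX3_DL (see the #if above).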
369 
370 #endif // HWY_TARGET == HWY_AVX3_DL
371 
372 // ================================================== SIGN
373 
374 // ------------------------------ CopySign
375 
376 template <typename T>
377 HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
378  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
379 
380  const Full512<T> d;
381  const auto msb = SignBit(d);
382 
383  const Rebind<MakeUnsigned<T>, decltype(d)> du;
384  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
385  // 0 0 0 | 0
386  // 0 0 1 | 0
387  // 0 1 0 | 1
388  // 0 1 1 | 1
389  // 1 0 0 | 0
390  // 1 0 1 | 1
391  // 1 1 0 | 0
392  // 1 1 1 | 1
393  // The lane size does not matter because we are not using predication.
394  const __m512i out = _mm512_ternarylogic_epi32(
395  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
396  return BitCast(d, decltype(Zero(du)){out});
397 }
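// Usage sketch (illustrative; `d` is a hypothetical descriptor):
//   const Full512<float> d;
//   const auto magn = Set(d, 2.0f);
//   const auto sign = Set(d, -0.0f);      // only the sign bit is set
//   const auto r = CopySign(magn, sign);  // every lane is -2.0f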
398 
399 template <typename T>
401  // AVX3 can also handle abs < 0, so no extra action needed.
402  return CopySign(abs, sign);
403 }
404 
405 // ================================================== MASK
406 
407 // ------------------------------ FirstN
408 
409 // Possibilities for constructing a bitmask of N ones:
410 // - kshift* only consider the lowest byte of the shift count, so they would
411 // not correctly handle large n.
412 // - Scalar shifts >= 64 are UB.
413 // - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
414 // we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
415 
416 #if HWY_ARCH_X86_32
417 namespace detail {
418 
419 // 32 bit mask is sufficient for lane size >= 2.
420 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
421 HWY_INLINE Mask512<T> FirstN(size_t n) {
422  Mask512<T> m;
423  m.raw = static_cast<decltype(m.raw)>(_bzhi_u32(~uint32_t(0), n));
424  return m;
425 }
426 
427 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
428 HWY_INLINE Mask512<T> FirstN(size_t n) {
429  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
430  return Mask512<T>{static_cast<__mmask64>(bits)};
431 }
432 
433 } // namespace detail
434 #endif // HWY_ARCH_X86_32
435 
436 template <typename T>
437 HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
438 #if HWY_ARCH_X86_64
439  Mask512<T> m;
440  m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(~uint64_t(0), n));
441  return m;
442 #else
443  return detail::FirstN<T>(n);
444 #endif // HWY_ARCH_X86_64
445 }
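// Usage sketch (illustrative; `count` is a hypothetical remainder size):
//   const Full512<float> d;
//   const Mask512<float> m = FirstN(d, count);  // lowest `count` lanes active
//   // Typically combined with IfThenElseZero or MaskedLoad for loop tails.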
446 
447 // ------------------------------ IfThenElse
448 
449 // Returns mask ? yes : no.
450 
451 namespace detail {
452 
453 // Templates for signed/unsigned integer of a particular size.
454 template <typename T>
456  const Mask512<T> mask, const Vec512<T> yes,
457  const Vec512<T> no) {
458  return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
459 }
460 template <typename T>
462  const Mask512<T> mask, const Vec512<T> yes,
463  const Vec512<T> no) {
464  return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
465 }
466 template <typename T>
468  const Mask512<T> mask, const Vec512<T> yes,
469  const Vec512<T> no) {
470  return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
471 }
472 template <typename T>
474  const Mask512<T> mask, const Vec512<T> yes,
475  const Vec512<T> no) {
476  return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
477 }
478 
479 } // namespace detail
480 
481 template <typename T>
483  const Vec512<T> no) {
484  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
485 }
487  const Vec512<float> yes,
488  const Vec512<float> no) {
489  return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
490 }
492  const Vec512<double> yes,
493  const Vec512<double> no) {
494  return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
495 }
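// Usage sketch (illustrative): clamp negative lanes to zero.
//   const Full512<float> d;
//   // given some Vec512<float> v:
//   const auto clamped = IfThenElse(v < Zero(d), Zero(d), v);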
496 
497 namespace detail {
498 
499 template <typename T>
501  const Mask512<T> mask,
502  const Vec512<T> yes) {
503  return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
504 }
505 template <typename T>
507  const Mask512<T> mask,
508  const Vec512<T> yes) {
509  return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
510 }
511 template <typename T>
513  const Mask512<T> mask,
514  const Vec512<T> yes) {
515  return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
516 }
517 template <typename T>
519  const Mask512<T> mask,
520  const Vec512<T> yes) {
521  return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
522 }
523 
524 } // namespace detail
525 
526 template <typename T>
528  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
529 }
531  const Vec512<float> yes) {
532  return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
533 }
535  const Vec512<double> yes) {
536  return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
537 }
538 
539 namespace detail {
540 
541 template <typename T>
543  const Mask512<T> mask, const Vec512<T> no) {
544  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
545  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
546 }
547 template <typename T>
549  const Mask512<T> mask, const Vec512<T> no) {
550  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
551 }
552 template <typename T>
554  const Mask512<T> mask, const Vec512<T> no) {
555  return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
556 }
557 template <typename T>
559  const Mask512<T> mask, const Vec512<T> no) {
560  return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
561 }
562 
563 } // namespace detail
564 
565 template <typename T>
567  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
568 }
570  const Vec512<float> no) {
571  return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
572 }
574  const Vec512<double> no) {
575  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
576 }
577 
578 template <typename T, HWY_IF_FLOAT(T)>
580  // AVX3 MaskFromVec only looks at the MSB
581  return IfThenZeroElse(MaskFromVec(v), v);
582 }
583 
584 // ================================================== ARITHMETIC
585 
586 // ------------------------------ Addition
587 
588 // Unsigned
590  const Vec512<uint8_t> b) {
591  return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
592 }
594  const Vec512<uint16_t> b) {
595  return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
596 }
598  const Vec512<uint32_t> b) {
599  return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
600 }
602  const Vec512<uint64_t> b) {
603  return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
604 }
605 
606 // Signed
608  const Vec512<int8_t> b) {
609  return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
610 }
612  const Vec512<int16_t> b) {
613  return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
614 }
616  const Vec512<int32_t> b) {
617  return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
618 }
620  const Vec512<int64_t> b) {
621  return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
622 }
623 
624 // Float
626  return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
627 }
629  const Vec512<double> b) {
630  return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
631 }
632 
633 // ------------------------------ Subtraction
634 
635 // Unsigned
637  const Vec512<uint8_t> b) {
638  return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
639 }
641  const Vec512<uint16_t> b) {
642  return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
643 }
645  const Vec512<uint32_t> b) {
646  return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
647 }
649  const Vec512<uint64_t> b) {
650  return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
651 }
652 
653 // Signed
655  const Vec512<int8_t> b) {
656  return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
657 }
659  const Vec512<int16_t> b) {
660  return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
661 }
663  const Vec512<int32_t> b) {
664  return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
665 }
667  const Vec512<int64_t> b) {
668  return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
669 }
670 
671 // Float
673  return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
674 }
676  const Vec512<double> b) {
677  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
678 }
679 
680 // ------------------------------ Saturating addition
681 
682 // Returns a + b clamped to the destination range.
683 
684 // Unsigned
686  const Vec512<uint8_t> b) {
687  return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
688 }
690  const Vec512<uint16_t> b) {
691  return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
692 }
693 
694 // Signed
696  const Vec512<int8_t> b) {
697  return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
698 }
700  const Vec512<int16_t> b) {
701  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
702 }
703 
704 // ------------------------------ Saturating subtraction
705 
706 // Returns a - b clamped to the destination range.
707 
708 // Unsigned
710  const Vec512<uint8_t> b) {
711  return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
712 }
714  const Vec512<uint16_t> b) {
715  return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
716 }
717 
718 // Signed
720  const Vec512<int8_t> b) {
721  return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
722 }
724  const Vec512<int16_t> b) {
725  return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
726 }
727 
728 // ------------------------------ Average
729 
730 // Returns (a + b + 1) / 2
731 
732 // Unsigned
734  const Vec512<uint8_t> b) {
735  return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
736 }
738  const Vec512<uint16_t> b) {
739  return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
740 }
741 
742 // ------------------------------ Abs (Sub)
743 
744 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
746 #if HWY_COMPILER_MSVC
747  // Workaround for incorrect codegen? (untested due to internal compiler error)
748  const auto zero = Zero(Full512<int8_t>());
749  return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
750 #else
751  return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
752 #endif
753 }
755  return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
756 }
758  return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
759 }
761  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
762 }
763 
764 // These aren't native instructions; they also involve an AND with a constant.
766  return Vec512<float>{_mm512_abs_ps(v.raw)};
767 }
769  return Vec512<double>{_mm512_abs_pd(v.raw)};
770 }
771 // ------------------------------ ShiftLeft
772 
773 template <int kBits>
775  return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
776 }
777 
778 template <int kBits>
780  return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
781 }
782 
783 template <int kBits>
785  return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
786 }
787 
788 template <int kBits>
790  return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
791 }
792 
793 template <int kBits>
795  return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
796 }
797 
798 template <int kBits>
800  return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
801 }
802 
803 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
805  const Full512<T> d8;
806  const RepartitionToWide<decltype(d8)> d16;
807  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
808  return kBits == 1
809  ? (v + v)
810  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
811 }
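// Worked example (illustrative): for kBits == 3, the 16-bit shift lets bits
// from each low byte spill into its high byte; masking every byte with
// (0xFF << 3) & 0xFF == 0xF8 clears those spilled-in low bits, leaving a
// correct per-byte left shift.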
812 
813 // ------------------------------ ShiftRight
814 
815 template <int kBits>
817  return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
818 }
819 
820 template <int kBits>
822  return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
823 }
824 
825 template <int kBits>
827  return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
828 }
829 
830 template <int kBits>
832  const Full512<uint8_t> d8;
833  // Use raw instead of BitCast to support N=1.
834  const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
835  return shifted & Set(d8, 0xFF >> kBits);
836 }
837 
838 template <int kBits>
840  return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
841 }
842 
843 template <int kBits>
845  return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
846 }
847 
848 template <int kBits>
850  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
851 }
852 
853 template <int kBits>
855  const Full512<int8_t> di;
856  const Full512<uint8_t> du;
857  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
858  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
859  return (shifted ^ shifted_sign) - shifted_sign;
860 }
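// Worked example (illustrative): for kBits == 2 and an input byte 0x80 (-128),
// the unsigned shift gives 0x20 and shifted_sign is 0x80 >> 2 == 0x20;
// (0x20 ^ 0x20) - 0x20 == -0x20 == -32, the correctly sign-extended result.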
861 
862 // ------------------------------ ShiftLeftSame
863 
865  const int bits) {
866  return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
867 }
869  const int bits) {
870  return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
871 }
873  const int bits) {
874  return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
875 }
876 
878  return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
879 }
880 
882  return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
883 }
884 
886  return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
887 }
888 
889 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
890 HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
891  const Full512<T> d8;
892  const RepartitionToWide<decltype(d8)> d16;
893  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
894  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
895 }
896 
897 // ------------------------------ ShiftRightSame
898 
900  const int bits) {
901  return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
902 }
904  const int bits) {
905  return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
906 }
908  const int bits) {
909  return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
910 }
911 
913  const Full512<uint8_t> d8;
914  const RepartitionToWide<decltype(d8)> d16;
915  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
916  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
917 }
918 
920  const int bits) {
921  return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
922 }
923 
925  const int bits) {
926  return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
927 }
929  const int bits) {
930  return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
931 }
932 
934  const Full512<int8_t> di;
935  const Full512<uint8_t> du;
936  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
937  const auto shifted_sign =
938  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
939  return (shifted ^ shifted_sign) - shifted_sign;
940 }
941 
942 // ------------------------------ Shl
943 
945  const Vec512<uint16_t> bits) {
946  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
947 }
948 
950  const Vec512<uint32_t> bits) {
951  return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
952 }
953 
955  const Vec512<uint64_t> bits) {
956  return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
957 }
958 
959 // Signed left shift is the same as unsigned.
960 template <typename T, HWY_IF_SIGNED(T)>
962  const Full512<T> di;
963  const Full512<MakeUnsigned<T>> du;
964  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
965 }
966 
967 // ------------------------------ Shr
968 
970  const Vec512<uint16_t> bits) {
971  return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
972 }
973 
975  const Vec512<uint32_t> bits) {
976  return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
977 }
978 
980  const Vec512<uint64_t> bits) {
981  return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
982 }
983 
985  const Vec512<int16_t> bits) {
986  return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
987 }
988 
990  const Vec512<int32_t> bits) {
991  return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
992 }
993 
995  const Vec512<int64_t> bits) {
996  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
997 }
998 
999 // ------------------------------ Minimum
1000 
1001 // Unsigned
1003  return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1004 }
1006  const Vec512<uint16_t> b) {
1007  return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1008 }
1010  const Vec512<uint32_t> b) {
1011  return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1012 }
1014  const Vec512<uint64_t> b) {
1015  return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1016 }
1017 
1018 // Signed
1020  return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1021 }
1023  return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1024 }
1026  return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1027 }
1029  return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1030 }
1031 
1032 // Float
1034  return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1035 }
1037  return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1038 }
1039 
1040 // ------------------------------ Maximum
1041 
1042 // Unsigned
1044  return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1045 }
1047  const Vec512<uint16_t> b) {
1048  return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1049 }
1051  const Vec512<uint32_t> b) {
1052  return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1053 }
1055  const Vec512<uint64_t> b) {
1056  return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1057 }
1058 
1059 // Signed
1061  return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1062 }
1064  return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1065 }
1067  return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1068 }
1070  return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1071 }
1072 
1073 // Float
1075  return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1076 }
1078  return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1079 }
1080 
1081 // ------------------------------ Integer multiplication
1082 
1083 // Unsigned
1085  const Vec512<uint16_t> b) {
1086  return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1087 }
1089  const Vec512<uint32_t> b) {
1090  return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1091 }
1092 
1093 // Signed
1095  const Vec512<int16_t> b) {
1096  return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1097 }
1099  const Vec512<int32_t> b) {
1100  return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1101 }
1102 
1103 // Returns the upper 16 bits of a * b in each lane.
1105  const Vec512<uint16_t> b) {
1106  return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1107 }
1109  const Vec512<int16_t> b) {
1110  return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1111 }
1112 
1113 // Multiplies even lanes (0, 2, ..); the lower half of each double-wide result
1114 // goes into the even lane and the upper half into its odd neighbor lane.
1115 HWY_API Vec512<int64_t> MulEven(const Vec512<int32_t> a,
1116  const Vec512<int32_t> b) {
1117  return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1118 }
1119 HWY_API Vec512<uint64_t> MulEven(const Vec512<uint32_t> a,
1120  const Vec512<uint32_t> b) {
1121  return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1122 }
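// Illustrative example: with a = {2, ?, 3, ?, ...} and b = {10, ?, 7, ?, ...}
// (odd uint32_t lanes ignored), MulEven returns the 64-bit lanes {20, 21, ...}.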
1123 
1124 // ------------------------------ Neg (Sub)
1125 
1126 template <typename T, HWY_IF_FLOAT(T)>
1128  return Xor(v, SignBit(Full512<T>()));
1129 }
1130 
1131 template <typename T, HWY_IF_NOT_FLOAT(T)>
1132 HWY_API Vec512<T> Neg(const Vec512<T> v) {
1133  return Zero(Full512<T>()) - v;
1134 }
1135 
1136 // ------------------------------ Floating-point mul / div
1137 
1139  return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1140 }
1142  const Vec512<double> b) {
1143  return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1144 }
1145 
1147  return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1148 }
1150  const Vec512<double> b) {
1151  return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1152 }
1153 
1154 // Approximate reciprocal
1156  return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1157 }
1158 
1159 // Absolute value of difference.
1161  return Abs(a - b);
1162 }
1163 
1164 // ------------------------------ Floating-point multiply-add variants
1165 
1166 // Returns mul * x + add
1168  const Vec512<float> add) {
1169  return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
1170 }
1172  const Vec512<double> add) {
1173  return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
1174 }
1175 
1176 // Returns add - mul * x
1178  const Vec512<float> add) {
1179  return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
1180 }
1182  const Vec512<double> x,
1183  const Vec512<double> add) {
1184  return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
1185 }
1186 
1187 // Returns mul * x - sub
1189  const Vec512<float> sub) {
1190  return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
1191 }
1193  const Vec512<double> sub) {
1194  return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
1195 }
1196 
1197 // Returns -mul * x - sub
1199  const Vec512<float> sub) {
1200  return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1201 }
1203  const Vec512<double> x,
1204  const Vec512<double> sub) {
1205  return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1206 }
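// Summary of the four variants (illustrative restatement of the comments
// above):
//   MulAdd(mul, x, add)    ==  mul * x + add
//   NegMulAdd(mul, x, add) ==  add - mul * x
//   MulSub(mul, x, sub)    ==  mul * x - sub
//   NegMulSub(mul, x, sub) == -mul * x - sub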
1207 
1208 // ------------------------------ Floating-point square root
1209 
1210 // Full precision square root
1212  return Vec512<float>{_mm512_sqrt_ps(v.raw)};
1213 }
1215  return Vec512<double>{_mm512_sqrt_pd(v.raw)};
1216 }
1217 
1218 // Approximate reciprocal square root
1220  return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
1221 }
1222 
1223 // ------------------------------ Floating-point rounding
1224 
1225 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1226 HWY_DIAGNOSTICS(push)
1227 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1228 
1229 // Toward nearest integer, tie to even
1230 HWY_API Vec512<float> Round(const Vec512<float> v) {
1231  return Vec512<float>{_mm512_roundscale_ps(
1232  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1233 }
1235  return Vec512<double>{_mm512_roundscale_pd(
1236  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1237 }
1238 
1239 // Toward zero, aka truncate
1241  return Vec512<float>{
1242  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1243 }
1245  return Vec512<double>{
1246  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1247 }
1248 
1249 // Toward +infinity, aka ceiling
1251  return Vec512<float>{
1252  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1253 }
1255  return Vec512<double>{
1256  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1257 }
1258 
1259 // Toward -infinity, aka floor
1261  return Vec512<float>{
1262  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1263 }
1265  return Vec512<double>{
1266  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1267 }
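// Per-lane worked examples (illustrative): Round of 2.5f is 2.0f (ties to
// even), Trunc of -1.7f is -1.0f, Ceil of -1.7f is -1.0f, Floor of -1.7f is
// -2.0f.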
1268 
1269 HWY_DIAGNOSTICS(pop)
1270 
1271 // ================================================== COMPARE
1272 
1273 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1274 
1275 template <typename TFrom, typename TTo>
1277  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1278  return Mask512<TTo>{m.raw};
1279 }
1280 
1281 namespace detail {
1282 
1283 template <typename T>
1285  const Vec512<T> bit) {
1286  return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
1287 }
1288 template <typename T>
1290  const Vec512<T> bit) {
1291  return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
1292 }
1293 template <typename T>
1295  const Vec512<T> bit) {
1296  return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
1297 }
1298 template <typename T>
1300  const Vec512<T> bit) {
1301  return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
1302 }
1303 
1304 } // namespace detail
1305 
1306 template <typename T>
1308  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1309  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1310 }
1311 
1312 // ------------------------------ Equality
1313 
1314 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1316  return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
1317 }
1318 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1319 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1320  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
1321 }
1322 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1323 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1324  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
1325 }
1326 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1327 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1328  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
1329 }
1330 
1332  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1333 }
1334 
1336  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1337 }
1338 
1339 // ------------------------------ Inequality
1340 
1341 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1343  return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
1344 }
1345 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1346 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1347  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
1348 }
1349 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1350 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1351  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
1352 }
1353 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1354 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1355  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
1356 }
1357 
1359  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1360 }
1361 
1363  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1364 }
1365 
1366 // ------------------------------ Strict inequality
1367 
1369  return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
1370 }
1372  return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
1373 }
1375  return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
1376 }
1378  return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
1379 }
1381  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1382 }
1384  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1385 }
1386 
1387 // ------------------------------ Weak inequality
1388 
1390  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1391 }
1393  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1394 }
1395 
1396 // ------------------------------ Reversed comparisons
1397 
1398 template <typename T>
1400  return b > a;
1401 }
1402 
1403 template <typename T>
1405  return b >= a;
1406 }
1407 
1408 // ------------------------------ Mask
1409 
1410 namespace detail {
1411 
1412 template <typename T>
1414  return Mask512<T>{_mm512_movepi8_mask(v.raw)};
1415 }
1416 template <typename T>
1418  return Mask512<T>{_mm512_movepi16_mask(v.raw)};
1419 }
1420 template <typename T>
1422  return Mask512<T>{_mm512_movepi32_mask(v.raw)};
1423 }
1424 template <typename T>
1426  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
1427 }
1428 
1429 } // namespace detail
1430 
1431 template <typename T>
1433  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1434 }
1435 // There do not seem to be native floating-point versions of these instructions.
1436 HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
1437  return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
1438 }
1439 HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
1440  return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
1441 }
1442 
1444  return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
1445 }
1447  return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
1448 }
1449 
1451  return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
1452 }
1454  return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
1455 }
1456 
1458  return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
1459 }
1461  return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
1462 }
1464  return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
1465 }
1466 
1468  return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
1469 }
1471  return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
1472 }
1474  return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
1475 }
1476 
1477 template <typename T>
1479  return VecFromMask(v);
1480 }
1481 
1482 // ------------------------------ Mask logical
1483 
1484 namespace detail {
1485 
1486 template <typename T>
1488 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1489  return Mask512<T>{_knot_mask64(m.raw)};
1490 #else
1491  return Mask512<T>{~m.raw};
1492 #endif
1493 }
1494 template <typename T>
1496 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1497  return Mask512<T>{_knot_mask32(m.raw)};
1498 #else
1499  return Mask512<T>{~m.raw};
1500 #endif
1501 }
1502 template <typename T>
1504 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1505  return Mask512<T>{_knot_mask16(m.raw)};
1506 #else
1507  return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
1508 #endif
1509 }
1510 template <typename T>
1512 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1513  return Mask512<T>{_knot_mask8(m.raw)};
1514 #else
1515  return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
1516 #endif
1517 }
1518 
1519 template <typename T>
1521  const Mask512<T> b) {
1522 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1523  return Mask512<T>{_kand_mask64(a.raw, b.raw)};
1524 #else
1525  return Mask512<T>{a.raw & b.raw};
1526 #endif
1527 }
1528 template <typename T>
1530  const Mask512<T> b) {
1531 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1532  return Mask512<T>{_kand_mask32(a.raw, b.raw)};
1533 #else
1534  return Mask512<T>{a.raw & b.raw};
1535 #endif
1536 }
1537 template <typename T>
1539  const Mask512<T> b) {
1540 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1541  return Mask512<T>{_kand_mask16(a.raw, b.raw)};
1542 #else
1543  return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
1544 #endif
1545 }
1546 template <typename T>
1548  const Mask512<T> b) {
1549 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1550  return Mask512<T>{_kand_mask8(a.raw, b.raw)};
1551 #else
1552  return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
1553 #endif
1554 }
1555 
1556 template <typename T>
1558  const Mask512<T> b) {
1559 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1560  return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
1561 #else
1562  return Mask512<T>{~a.raw & b.raw};
1563 #endif
1564 }
1565 template <typename T>
1567  const Mask512<T> b) {
1568 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1569  return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
1570 #else
1571  return Mask512<T>{~a.raw & b.raw};
1572 #endif
1573 }
1574 template <typename T>
1576  const Mask512<T> b) {
1577 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1578  return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
1579 #else
1580  return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
1581 #endif
1582 }
1583 template <typename T>
1585  const Mask512<T> b) {
1586 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1587  return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
1588 #else
1589  return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
1590 #endif
1591 }
1592 
1593 template <typename T>
1595  const Mask512<T> b) {
1596 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1597  return Mask512<T>{_kor_mask64(a.raw, b.raw)};
1598 #else
1599  return Mask512<T>{a.raw | b.raw};
1600 #endif
1601 }
1602 template <typename T>
1604  const Mask512<T> b) {
1605 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1606  return Mask512<T>{_kor_mask32(a.raw, b.raw)};
1607 #else
1608  return Mask512<T>{a.raw | b.raw};
1609 #endif
1610 }
1611 template <typename T>
1613  const Mask512<T> b) {
1614 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1615  return Mask512<T>{_kor_mask16(a.raw, b.raw)};
1616 #else
1617  return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
1618 #endif
1619 }
1620 template <typename T>
1622  const Mask512<T> b) {
1623 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1624  return Mask512<T>{_kor_mask8(a.raw, b.raw)};
1625 #else
1626  return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
1627 #endif
1628 }
1629 
1630 template <typename T>
1632  const Mask512<T> b) {
1633 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1634  return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
1635 #else
1636  return Mask512<T>{a.raw ^ b.raw};
1637 #endif
1638 }
1639 template <typename T>
1641  const Mask512<T> b) {
1642 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1643  return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
1644 #else
1645  return Mask512<T>{a.raw ^ b.raw};
1646 #endif
1647 }
1648 template <typename T>
1650  const Mask512<T> b) {
1651 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1652  return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
1653 #else
1654  return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
1655 #endif
1656 }
1657 template <typename T>
1659  const Mask512<T> b) {
1660 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1661  return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
1662 #else
1663  return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
1664 #endif
1665 }
1666 
1667 } // namespace detail
1668 
1669 template <typename T>
1671  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1672 }
1673 
1674 template <typename T>
1676  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1677 }
1678 
1679 template <typename T>
1681  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1682 }
1683 
1684 template <typename T>
1686  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1687 }
1688 
1689 template <typename T>
1691  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1692 }
1693 
1694 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1695 
1697  return VecFromMask(v < Zero(Full512<int8_t>()));
1698 }
1699 
1701  return ShiftRight<15>(v);
1702 }
1703 
1705  return ShiftRight<31>(v);
1706 }
1707 
1709  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
1710 }
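// Illustrative note: BroadcastSignBit replicates the sign bit across the whole
// lane, so negative lanes become all-ones (-1) and non-negative lanes become 0;
// e.g. for a hypothetical Full512<int32_t> d, BroadcastSignBit(Set(d, -5))
// yields -1 in every lane.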
1711 
1712 // ================================================== MEMORY
1713 
1714 // ------------------------------ Load
1715 
1716 template <typename T>
1717 HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
1718  return Vec512<T>{_mm512_load_si512(aligned)};
1719 }
1721  const float* HWY_RESTRICT aligned) {
1722  return Vec512<float>{_mm512_load_ps(aligned)};
1723 }
1725  const double* HWY_RESTRICT aligned) {
1726  return Vec512<double>{_mm512_load_pd(aligned)};
1727 }
1728 
1729 template <typename T>
1731  return Vec512<T>{_mm512_loadu_si512(p)};
1732 }
1734  const float* HWY_RESTRICT p) {
1735  return Vec512<float>{_mm512_loadu_ps(p)};
1736 }
1738  const double* HWY_RESTRICT p) {
1739  return Vec512<double>{_mm512_loadu_pd(p)};
1740 }
1741 
1742 // ------------------------------ MaskedLoad
1743 
1744 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1746  const T* HWY_RESTRICT aligned) {
1747  return Vec512<T>{_mm512_maskz_load_epi32(m.raw, aligned)};
1748 }
1749 
1750 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1751 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1752  const T* HWY_RESTRICT aligned) {
1753  return Vec512<T>{_mm512_maskz_load_epi64(m.raw, aligned)};
1754 }
1755 
1757  const float* HWY_RESTRICT aligned) {
1758  return Vec512<float>{_mm512_maskz_load_ps(m.raw, aligned)};
1759 }
1760 
1762  const double* HWY_RESTRICT aligned) {
1763  return Vec512<double>{_mm512_maskz_load_pd(m.raw, aligned)};
1764 }
1765 
1766 // There is no load_epi8/16, so use loadu instead.
1767 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1768 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1769  const T* HWY_RESTRICT aligned) {
1770  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, aligned)};
1771 }
1772 
1773 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1774 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1775  const T* HWY_RESTRICT aligned) {
1776  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, aligned)};
1777 }
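// Usage sketch (illustrative; `ptr` and `count` are hypothetical, and for
// 4/8-byte lanes `ptr` must satisfy the usual 64-byte alignment of Load):
//   const Full512<float> d;
//   const auto v = MaskedLoad(FirstN(d, count), d, ptr);  // inactive lanes = 0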
1778 
1779 // ------------------------------ LoadDup128
1780 
1781 // Loads 128 bits and duplicates them into all four 128-bit blocks. This avoids
1782 // the 3-cycle cost of moving data between 128-bit halves and avoids port 5.
1783 template <typename T>
1785  const T* const HWY_RESTRICT p) {
1786  // Clang 3.9 generates VINSERTF128 which is slower, but inline assembly leads
1787  // to "invalid output size for constraint" without -mavx512:
1788  // https://gcc.godbolt.org/z/-Jt_-F
1789 #if HWY_LOADDUP_ASM
1790  __m512i out;
1791  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
1792  return Vec512<T>{out};
1793 #else
1794  const auto x4 = LoadU(Full128<T>(), p);
1795  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1796 #endif
1797 }
1799  const float* const HWY_RESTRICT p) {
1800 #if HWY_LOADDUP_ASM
1801  __m512 out;
1802  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
1803  return Vec512<float>{out};
1804 #else
1805  const __m128 x4 = _mm_loadu_ps(p);
1806  return Vec512<float>{_mm512_broadcast_f32x4(x4)};
1807 #endif
1808 }
1809 
1811  const double* const HWY_RESTRICT p) {
1812 #if HWY_LOADDUP_ASM
1813  __m512d out;
1814  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
1815  return Vec512<double>{out};
1816 #else
1817  const __m128d x2 = _mm_loadu_pd(p);
1818  return Vec512<double>{_mm512_broadcast_f64x2(x2)};
1819 #endif
1820 }
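// Usage sketch (illustrative; `four_floats` is a hypothetical pointer to at
// least four floats):
//   const Full512<float> d;
//   const auto v = LoadDup128(d, four_floats);  // all four blocks hold the
//                                               // same 128 bits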
1821 
1822 // ------------------------------ Store
1823 
1824 template <typename T>
1825 HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
1826  T* HWY_RESTRICT aligned) {
1827  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1828 }
1829 HWY_API void Store(const Vec512<float> v, Full512<float> /* tag */,
1830  float* HWY_RESTRICT aligned) {
1831  _mm512_store_ps(aligned, v.raw);
1832 }
1834  double* HWY_RESTRICT aligned) {
1835  _mm512_store_pd(aligned, v.raw);
1836 }
1837 
1838 template <typename T>
1839 HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
1840  T* HWY_RESTRICT p) {
1841  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
1842 }
1843 HWY_API void StoreU(const Vec512<float> v, Full512<float> /* tag */,
1844  float* HWY_RESTRICT p) {
1845  _mm512_storeu_ps(p, v.raw);
1846 }
1848  double* HWY_RESTRICT p) {
1849  _mm512_storeu_pd(p, v.raw);
1850 }
1851 
1852 // ------------------------------ Non-temporal stores
1853 
1854 template <typename T>
1855 HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
1856  T* HWY_RESTRICT aligned) {
1857  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1858 }
1859 HWY_API void Stream(const Vec512<float> v, Full512<float> /* tag */,
1860  float* HWY_RESTRICT aligned) {
1861  _mm512_stream_ps(aligned, v.raw);
1862 }
1864  double* HWY_RESTRICT aligned) {
1865  _mm512_stream_pd(aligned, v.raw);
1866 }
1867 
1868 // ------------------------------ Scatter
1869 
1870 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1871 HWY_DIAGNOSTICS(push)
1872 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1873 
1874 namespace detail {
1875 
1876 template <typename T>
1878  Full512<T> /* tag */, T* HWY_RESTRICT base,
1879  const Vec512<int32_t> offset) {
1880  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
1881 }
1882 template <typename T>
1884  Full512<T> /* tag */, T* HWY_RESTRICT base,
1885  const Vec512<int32_t> index) {
1886  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
1887 }
1888 
1889 template <typename T>
1891  Full512<T> /* tag */, T* HWY_RESTRICT base,
1892  const Vec512<int64_t> offset) {
1893  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
1894 }
1895 template <typename T>
1897  Full512<T> /* tag */, T* HWY_RESTRICT base,
1898  const Vec512<int64_t> index) {
1899  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
1900 }
1901 
1902 } // namespace detail
1903 
1904 template <typename T, typename Offset>
1906  const Vec512<Offset> offset) {
1907  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1908  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
1909 }
1910 template <typename T, typename Index>
1912  const Vec512<Index> index) {
1913  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1914  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
1915 }
1916 
1918  float* HWY_RESTRICT base,
1919  const Vec512<int32_t> offset) {
1920  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
1921 }
1923  float* HWY_RESTRICT base,
1924  const Vec512<int32_t> index) {
1925  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
1926 }
1927 
1929  double* HWY_RESTRICT base,
1930  const Vec512<int64_t> offset) {
1931  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
1932 }
1934  double* HWY_RESTRICT base,
1935  const Vec512<int64_t> index) {
1936  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
1937 }
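// Illustrative distinction: ScatterOffset treats `offset` as byte offsets
// (intrinsic scale 1), whereas ScatterIndex treats `index` as lane indices
// (scale sizeof(T)); e.g. a float index of 3 writes to base + 12 bytes.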
1938 
1939 // ------------------------------ Gather
1940 
1941 namespace detail {
1942 
1943 template <typename T>
1945  Full512<T> /* tag */,
1946  const T* HWY_RESTRICT base,
1947  const Vec512<int32_t> offset) {
1948  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
1949 }
1950 template <typename T>
1952  Full512<T> /* tag */,
1953  const T* HWY_RESTRICT base,
1954  const Vec512<int32_t> index) {
1955  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
1956 }
1957 
1958 template <typename T>
1960  Full512<T> /* tag */,
1961  const T* HWY_RESTRICT base,
1962  const Vec512<int64_t> offset) {
1963  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
1964 }
1965 template <typename T>
1967  Full512<T> /* tag */,
1968  const T* HWY_RESTRICT base,
1969  const Vec512<int64_t> index) {
1970  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
1971 }
1972 
1973 } // namespace detail
1974 
1975 template <typename T, typename Offset>
1977  const Vec512<Offset> offset) {
1978 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1979  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
1980 }
1981 template <typename T, typename Index>
1983  const Vec512<Index> index) {
1984  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1985  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
1986 }
1987 
1989  const float* HWY_RESTRICT base,
1990  const Vec512<int32_t> offset) {
1991  return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
1992 }
1994  const float* HWY_RESTRICT base,
1995  const Vec512<int32_t> index) {
1996  return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
1997 }
1998 
2000  const double* HWY_RESTRICT base,
2001  const Vec512<int64_t> offset) {
2002  return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
2003 }
2005  const double* HWY_RESTRICT base,
2006  const Vec512<int64_t> index) {
2007  return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
2008 }
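// Usage sketch (illustrative; `table` is a hypothetical float array and Iota
// is assumed from the Highway API):
//   const Full512<float> df;
//   const Full512<int32_t> di;
//   const auto idx = Iota(di, 0);                // indices 0..15
//   const auto v = GatherIndex(df, table, idx);  // v[i] = table[i]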
2009 
2010 HWY_DIAGNOSTICS(pop)
2011 
2012 // ================================================== SWIZZLE
2013 
2014 // ------------------------------ LowerHalf
2015 
2016 template <typename T>
2018  return Vec256<T>{_mm512_castsi512_si256(v.raw)};
2019 }
2021  return Vec256<float>{_mm512_castps512_ps256(v.raw)};
2022 }
2024  return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
2025 }
2026 
2027 template <typename T>
2029  return LowerHalf(Full256<T>(), v);
2030 }
2031 
2032 // ------------------------------ UpperHalf
2033 
2034 template <typename T>
2036  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
2037 }
2039  return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
2040 }
2042  return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
2043 }
2044 
2045 // ------------------------------ GetLane (LowerHalf)
2046 template <typename T>
2048  return GetLane(LowerHalf(v));
2049 }
2050 
2051 // ------------------------------ ZeroExtendVector
2052 
2053 // Unfortunately the initial _mm512_castsi256_si512 intrinsic leaves the upper
2054 // bits undefined. Although it makes sense for them to be zero (EVEX encoded
2055 // instructions have that effect), a compiler could decide to optimize out code
2056 // that relies on this.
2057 //
2058 // The newer _mm512_zextsi256_si512 intrinsic fixes this by specifying the
2059 // zeroing, but it is not available on GCC until 10.1. For older GCC, we can
2060 // still obtain the desired code thanks to pattern recognition; note that the
2061 // expensive insert instruction is not actually generated, see
2062 // https://gcc.godbolt.org/z/1MKGaP.
2063 
2064 template <typename T>
2066 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2067  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
2068 #else
2069  return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
2070 #endif
2071 }
2073  Vec256<float> lo) {
2074 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2075  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
2076 #else
2077  return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
2078 #endif
2079 }
2081  Vec256<double> lo) {
2082 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2083  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
2084 #else
2085  return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
2086 #endif
2087 }
2088 
2089 // ------------------------------ Combine
2090 
2091 template <typename T>
2093  const auto lo512 = ZeroExtendVector(d, lo);
2094  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
2095 }
2097  Vec256<float> lo) {
2098  const auto lo512 = ZeroExtendVector(d, lo);
2099  return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
2100 }
2102  Vec256<double> lo) {
2103  const auto lo512 = ZeroExtendVector(d, lo);
2104  return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
2105 }
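// Illustrative note: Combine(d, hi, lo) places `lo` in the lower 256 bits and
// `hi` in the upper 256 bits; ZeroExtendVector(d, lo) is the special case
// where the upper half is zero.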
2106 
2107 // ------------------------------ ShiftLeftBytes
2108 
2109 template <int kBytes, typename T>
2111  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2112  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
2113 }
2114 
2115 template <int kBytes, typename T>
2117  return ShiftLeftBytes<kBytes>(Full512<T>(), v);
2118 }
2119 
2120 // ------------------------------ ShiftLeftLanes
2121 
2122 template <int kLanes, typename T>
2124  const Repartition<uint8_t, decltype(d)> d8;
2125  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2126 }
2127 
2128 template <int kLanes, typename T>
2130  return ShiftLeftLanes<kLanes>(Full512<T>(), v);
2131 }
2132 
2133 // ------------------------------ ShiftRightBytes
2134 template <int kBytes, typename T>
2136  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2137  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
2138 }
2139 
2140 // ------------------------------ ShiftRightLanes
2141 template <int kLanes, typename T>
2143  const Repartition<uint8_t, decltype(d)> d8;
2144  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2145 }
2146 
2147 // ------------------------------ CombineShiftRightBytes
2148 
2149 template <int kBytes, typename T, class V = Vec512<T>>
2151  const Repartition<uint8_t, decltype(d)> d8;
2152  return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
2153  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2154 }
2155 
2156 // ------------------------------ Broadcast/splat any lane
2157 
2158 // Unsigned
2159 template <int kLane>
2161  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2162  if (kLane < 4) {
2163  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2164  return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
2165  } else {
2166  const __m512i hi =
2167  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2168  return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
2169  }
2170 }
2171 template <int kLane>
2173  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2174  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2175  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2176 }
2177 template <int kLane>
2179  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2180  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2181  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2182 }
2183 
2184 // Signed
2185 template <int kLane>
2187  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2188  if (kLane < 4) {
2189  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2190  return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
2191  } else {
2192  const __m512i hi =
2193  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2194  return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
2195  }
2196 }
2197 template <int kLane>
2199  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2200  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2201  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2202 }
2203 template <int kLane>
2205  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2206  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2207  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2208 }
2209 
2210 // Float
2211 template <int kLane>
2213  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2214  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2215  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
2216 }
2217 template <int kLane>
2219  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2220  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
2221  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
2222 }
2223 
2224 // ------------------------------ Hard-coded shuffles
2225 
2226 // Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2227 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2228 // right (the previous least-significant lane is now most-significant =>
2229 // 47650321). These could also be implemented via CombineShiftRightBytes but
2230 // the shuffle_abcd notation is more convenient.
2231 
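// Illustrative example (not part of the original file): within each 128-bit
// block of a Vec512<int32_t> holding lanes 3,2,1,0 (3 = most significant),
// Shuffle0321 produces 0,3,2,1; the other Shuffle* helpers below permute all
// four blocks in the same way.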
2232 // Swap 32-bit halves in 64-bit halves.
2234  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2235 }
2237  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2238 }
2240  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
2241 }
2242 
2243 // Swap 64-bit halves
2245  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2246 }
2248  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2249 }
2251  // Shorter encoding than _mm512_permute_ps.
2252  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
2253 }
2255  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2256 }
2258  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2259 }
2261  // Shorter encoding than _mm512_permute_pd.
2262  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
2263 }
2264 
2265 // Rotate right 32 bits
2267  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2268 }
2270  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2271 }
2273  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
2274 }
2275 // Rotate left 32 bits
2277  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2278 }
2280  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2281 }
2283  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
2284 }
2285 
2286 // Reverse
2288  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2289 }
2291  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2292 }
2294  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
2295 }
2296 
2297 // ------------------------------ TableLookupLanes
2298 
2299 // Returned by SetTableIndices for use by TableLookupLanes.
2300 template <typename T>
2301 struct Indices512 {
2302  __m512i raw;
2303 };
2304 
2305 template <typename T>
2306 HWY_API Indices512<T> SetTableIndices(const Full512<T>, const int32_t* idx) {
2307 #if HWY_IS_DEBUG_BUILD
2308  const size_t N = 64 / sizeof(T);
2309  for (size_t i = 0; i < N; ++i) {
2310  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
2311  }
2312 #endif
2313  return Indices512<T>{LoadU(Full512<int32_t>(), idx).raw};
2314 }
2315 
2317  const Indices512<uint32_t> idx) {
2318  return Vec512<uint32_t>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2319 }
2321  const Indices512<int32_t> idx) {
2322  return Vec512<int32_t>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2323 }
2325  const Indices512<float> idx) {
2326  return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
2327 }
2328 
2329 // ------------------------------ Reverse
2330 
2331 template <typename T>
2333  alignas(32) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2334  7, 6, 5, 4, 3, 2, 1, 0};
2335  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2336 }
2337 
2338 // ------------------------------ InterleaveLower
2339 
2340 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2341 // the least-significant lane) and "b". To concatenate two half-width integers
2342 // into one, use ZipLower/Upper instead (also works with scalar).
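// Example (not part of the original file), per 128-bit block of uint32_t with
// a = {a3,a2,a1,a0} and b = {b3,b2,b1,b0} (most-significant lane first):
//   InterleaveLower(a, b) = {b1,a1,b0,a0};  InterleaveUpper(d, a, b) = {b3,a3,b2,a2}.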
2343 
2345  const Vec512<uint8_t> b) {
2346  return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2347 }
2349  const Vec512<uint16_t> b) {
2350  return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2351 }
2353  const Vec512<uint32_t> b) {
2354  return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2355 }
2357  const Vec512<uint64_t> b) {
2358  return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2359 }
2360 
2362  const Vec512<int8_t> b) {
2363  return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2364 }
2366  const Vec512<int16_t> b) {
2367  return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2368 }
2370  const Vec512<int32_t> b) {
2371  return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2372 }
2374  const Vec512<int64_t> b) {
2375  return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2376 }
2377 
2379  const Vec512<float> b) {
2380  return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
2381 }
2383  const Vec512<double> b) {
2384  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
2385 }
2386 
2387 // Additional overload for the optional Simd<> tag.
2388 template <typename T, class V = Vec512<T>>
2389 HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) {
2390  return InterleaveLower(a, b);
2391 }
2392 
2393 // ------------------------------ InterleaveUpper
2394 
2395 // All functions inside detail lack the required D parameter.
2396 namespace detail {
2397 
2399  const Vec512<uint8_t> b) {
2400  return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2401 }
2403  const Vec512<uint16_t> b) {
2404  return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2405 }
2407  const Vec512<uint32_t> b) {
2408  return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2409 }
2411  const Vec512<uint64_t> b) {
2412  return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2413 }
2414 
2416  const Vec512<int8_t> b) {
2417  return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2418 }
2420  const Vec512<int16_t> b) {
2421  return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2422 }
2424  const Vec512<int32_t> b) {
2425  return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2426 }
2428  const Vec512<int64_t> b) {
2429  return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2430 }
2431 
2433  const Vec512<float> b) {
2434  return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
2435 }
2437  const Vec512<double> b) {
2438  return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
2439 }
2440 
2441 } // namespace detail
2442 
2443 template <typename T, class V = Vec512<T>>
2444 HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
2445  return detail::InterleaveUpper(a, b);
2446 }
2447 
2448 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2449 
2450 // Same as Interleave*, except that the return lanes are double-width integers;
2451 // this is necessary because the single-lane scalar cannot return two values.
2452 template <typename T, typename TW = MakeWide<T>>
2454  return BitCast(Full512<TW>(), InterleaveLower(a, b));
2455 }
2456 template <typename T, typename TW = MakeWide<T>>
2458  return BitCast(Full512<TW>(), InterleaveLower(d, a, b));
2459 }
2460 
2461 template <typename T, typename TW = MakeWide<T>>
2463  return BitCast(Full512<TW>(), InterleaveUpper(d, a, b));
2464 }
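// Example (not part of the original file): for uint16_t inputs with lanes
// a = {..., a1, a0} and b = {..., b1, b0}, ZipLower returns uint32_t lanes
// {..., (b1 << 16) | a1, (b0 << 16) | a0}; "a" supplies the lower half of each
// widened lane.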
2465 
2466 // ------------------------------ Concat* halves
2467 
2468 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2469 template <typename T>
2471  const Vec512<T> lo) {
2472  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2473 }
2475  const Vec512<float> hi,
2476  const Vec512<float> lo) {
2477  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2478 }
2480  const Vec512<double> hi,
2481  const Vec512<double> lo) {
2482  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
2483 }
2484 
2485 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2486 template <typename T>
2488  const Vec512<T> lo) {
2489  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2490 }
2492  const Vec512<float> hi,
2493  const Vec512<float> lo) {
2494  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2495 }
2497  const Vec512<double> hi,
2498  const Vec512<double> lo) {
2499  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
2500 }
2501 
2502 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2503 template <typename T>
2505  const Vec512<T> lo) {
2506  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, 0x4E)};
2507 }
2509  const Vec512<float> hi,
2510  const Vec512<float> lo) {
2511  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, 0x4E)};
2512 }
2514  const Vec512<double> hi,
2515  const Vec512<double> lo) {
2516  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, 0x4E)};
2517 }
2518 
2519 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2520 template <typename T>
2522  const Vec512<T> lo) {
2523  // There is no imm8 blend in AVX512. Use a 16-bit blend because 32-bit masks
2524  // are efficiently loaded from 32-bit regs.
2525  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
2526  return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
2527 }
2529  const Vec512<float> hi,
2530  const Vec512<float> lo) {
2531  const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
2532  return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
2533 }
2535  const Vec512<double> hi,
2536  const Vec512<double> lo) {
2537  const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
2538  return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
2539 }
2540 
2541 // ------------------------------ ConcatOdd
2542 
2543 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2545  const RebindToUnsigned<decltype(d)> du;
2546  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2547  17, 19, 21, 23, 25, 27, 29, 31};
2548  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2549  BitCast(du, lo).raw, Load(du, kIdx).raw,
2550  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2551 }
2552 
2554  Vec512<float> lo) {
2555  const RebindToUnsigned<decltype(d)> du;
2556  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2557  17, 19, 21, 23, 25, 27, 29, 31};
2558  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2559  __mmask16{0xFFFF}, hi.raw)};
2560 }
2561 
2562 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2563 HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2564  const RebindToUnsigned<decltype(d)> du;
2565  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2566  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2567  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2568  BitCast(du, hi).raw)});
2569 }
2570 
2572  Vec512<double> lo) {
2573  const RebindToUnsigned<decltype(d)> du;
2574  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2575  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2576  __mmask8{0xFF}, hi.raw)};
2577 }
2578 
2579 // ------------------------------ ConcatEven
2580 
2581 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2583  const RebindToUnsigned<decltype(d)> du;
2584  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2585  16, 18, 20, 22, 24, 26, 28, 30};
2586  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2587  BitCast(du, lo).raw, Load(du, kIdx).raw,
2588  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2589 }
2590 
2592  Vec512<float> lo) {
2593  const RebindToUnsigned<decltype(d)> du;
2594  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2595  16, 18, 20, 22, 24, 26, 28, 30};
2596  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2597  __mmask16{0xFFFF}, hi.raw)};
2598 }
2599 
2600 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2601 HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2602  const RebindToUnsigned<decltype(d)> du;
2603  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2604  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2605  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2606  BitCast(du, hi).raw)});
2607 }
2608 
2610  Vec512<double> lo) {
2611  const RebindToUnsigned<decltype(d)> du;
2612  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2613  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2614  __mmask8{0xFF}, hi.raw)};
2615 }
2616 
2617 // ------------------------------ OddEven
2618 
2619 template <typename T>
2621  constexpr size_t s = sizeof(T);
2622  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
2623  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
2624 }
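// Illustrative note (not part of the original file): the mask constant selects
// the even (lower-indexed) lanes. 0x5555... sets every other of the 64 mask
// bits for 1-byte lanes; shifting right by 32/48/56 leaves the same pattern in
// the low 32/16/8 bits for 2/4/8-byte lanes. E.g. for T = uint32_t,
// OddEven(a, b) = {a15, b14, a13, ..., a1, b0} (b in even lanes, a in odd).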
2625 
2626 // ------------------------------ TableLookupBytes (ZeroExtendVector)
2627 
2628 // Both full
2629 template <typename T>
2631  return Vec512<T>{_mm512_shuffle_epi8(bytes.raw, from.raw)};
2632 }
2633 
2634 // Partial index vector
2635 template <typename T, typename TI, size_t NI>
2637  const Full512<TI> d512;
2638  const Half<decltype(d512)> d256;
2639  const Half<decltype(d256)> d128;
2640  // First expand to full 128, then 256, then 512.
2641  const Vec128<TI> from_full{from.raw};
2642  const auto from_512 =
2643  ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
2644  const auto tbl_full = TableLookupBytes(bytes, from_512);
2645  // Shrink to 256, then 128, then partial.
2646  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
2647 }
2648 template <typename T, typename TI>
2650  const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
2651  return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
2652 }
2653 
2654 // Partial table vector
2655 template <typename T, size_t N, typename TI>
2657  const Full512<TI> d512;
2658  const Half<decltype(d512)> d256;
2659  const Half<decltype(d256)> d128;
2660  // First expand to full 128, then 256, then 512.
2661  const Vec128<T> bytes_full{bytes.raw};
2662  const auto bytes_512 =
2663  ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
2664  return TableLookupBytes(bytes_512, from);
2665 }
2666 template <typename T, typename TI>
2668  const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
2669  return TableLookupBytes(bytes_512, from);
2670 }
2671 
2672 // The case where both vectors are partial is handled by x86_128/256.
2673 
2674 // ================================================== CONVERT
2675 
2676 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2677 
2678 // Unsigned: zero-extend.
2679 // Note: these have 3 cycle latency; if inputs are already split across the
2680 // 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
2682  Vec256<uint8_t> v) {
2683  return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
2684 }
2686  Vec128<uint8_t> v) {
2687  return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
2688 }
2690  Vec256<uint8_t> v) {
2691  return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
2692 }
2694  Vec128<uint8_t> v) {
2695  return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
2696 }
2698  Vec256<uint16_t> v) {
2699  return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
2700 }
2702  Vec256<uint16_t> v) {
2703  return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
2704 }
2706  Vec256<uint32_t> v) {
2707  return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
2708 }
2709 
2710 // Signed: replicate sign bit.
2711 // Note: these have 3 cycle latency; if inputs are already split across the
2712 // 128-bit blocks (in their upper/lower halves), then ZipUpper/ZipLower followed by
2713 // signed shift would be faster.
2715  Vec256<int8_t> v) {
2716  return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
2717 }
2719  Vec128<int8_t> v) {
2720  return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
2721 }
2723  Vec256<int16_t> v) {
2724  return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
2725 }
2727  Vec256<int32_t> v) {
2728  return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
2729 }
2730 
2731 // Float
2733  const Vec256<float16_t> v) {
2734  return Vec512<float>{_mm512_cvtph_ps(v.raw)};
2735 }
2736 
2738  const Vec256<bfloat16_t> v) {
2739  const Rebind<uint16_t, decltype(df32)> du16;
2740  const RebindToSigned<decltype(df32)> di32;
2741  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2742 }
2743 
2745  return Vec512<double>{_mm512_cvtps_pd(v.raw)};
2746 }
2747 
2749  return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
2750 }
2751 
2752 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2753 
2755  const Vec512<int32_t> v) {
2756  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
2757 
2758  // Compress even u64 lanes into 256 bit.
2759  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
2760  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
2761  const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
2762  return LowerHalf(even);
2763 }
2764 
2766  const Vec512<int32_t> v) {
2767  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
2768 
2769  // Compress even u64 lanes into 256 bit.
2770  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
2771  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
2772  const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
2773  return LowerHalf(even);
2774 }
2775 
2777  const Vec512<int32_t> v) {
2778  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
2779  // packus treats the input as signed; we want unsigned. Clear the MSB to get
2780  // unsigned saturation to u8.
2781  const Vec512<int16_t> i16{
2782  _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
2783  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
2784 
2785  alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
2786  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
2787  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
2788  return LowerHalf(LowerHalf(fixed));
2789 }
2790 
2792  const Vec512<int16_t> v) {
2793  const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
2794 
2795  // Compress even u64 lanes into 256 bit.
2796  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
2797  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
2798  const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
2799  return LowerHalf(even);
2800 }
2801 
2803  const Vec512<int32_t> v) {
2804  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
2805  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
2806 
2807  alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
2808  0, 4, 8, 12, 0, 4, 8, 12};
2809  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
2810  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
2811  return LowerHalf(LowerHalf(fixed));
2812 }
2813 
2815  const Vec512<int16_t> v) {
2816  const Vec512<int8_t> i8{_mm512_packs_epi16(v.raw, v.raw)};
2817 
2818  // Compress even u64 lanes into 256 bit.
2819  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
2820  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
2821  const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, i8.raw)};
2822  return LowerHalf(even);
2823 }
2824 
2826  const Vec512<float> v) {
2827  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2828  HWY_DIAGNOSTICS(push)
2829  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2830  return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
2831  HWY_DIAGNOSTICS(pop)
2832 }
2833 
2835  const Vec512<float> v) {
2836  // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
2837  const Rebind<int32_t, decltype(dbf16)> di32;
2838  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2839  const Rebind<uint16_t, decltype(dbf16)> du16;
2840  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2841  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2842 }
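// Bit-level sketch (not part of the original file): bfloat16 is the upper 16
// bits of a binary32 value, so the demotion above is a pure truncation. For
// example 1.25f has bits 0x3FA00000; ShiftRight<16> gives 0x00003FA0 and the
// u16 DemoteTo keeps 0x3FA0, which is 1.25 in bfloat16.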
2843 
2846  // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
2847  const RebindToUnsigned<decltype(dbf16)> du16;
2848  const Repartition<uint32_t, decltype(dbf16)> du32;
2849  const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
2850  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2851 }
2852 
2854  const Vec512<double> v) {
2855  return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
2856 }
2857 
2859  const Vec512<double> v) {
2860  const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
2861  return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
2862 }
2863 
2864 // For already range-limited input [0, 255].
2866  const Full512<uint32_t> d32;
2867  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
2868  // lowest 4 bytes.
2869  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
2870  ~0u};
2871  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
2872  // Gather the lowest 4 bytes of 4 128-bit blocks.
2873  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
2874  const Vec512<uint8_t> bytes{
2875  _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
2876  return LowerHalf(LowerHalf(bytes));
2877 }
2878 
2879 // ------------------------------ Convert integer <=> floating point
2880 
2882  const Vec512<int32_t> v) {
2883  return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
2884 }
2885 
2887  const Vec512<int64_t> v) {
2888  return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
2889 }
2890 
2891 // Truncates (rounds toward zero).
2893  return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
2894 }
2896  return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
2897 }
2898 
2900  const Full512<int32_t> di;
2901  return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
2902 }
2903 
2904 // ================================================== CRYPTO
2905 
2906 #if !defined(HWY_DISABLE_PCLMUL_AES)
2907 
2908 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2909 #ifdef HWY_NATIVE_AES
2910 #undef HWY_NATIVE_AES
2911 #else
2912 #define HWY_NATIVE_AES
2913 #endif
2914 
2916  Vec512<uint8_t> round_key) {
2917 #if HWY_TARGET == HWY_AVX3_DL
2918  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
2919 #else
2920  alignas(64) uint8_t a[64];
2921  alignas(64) uint8_t b[64];
2922  const Full512<uint8_t> d;
2923  const Full128<uint8_t> d128;
2924  Store(state, d, a);
2925  Store(round_key, d, b);
2926  for (size_t i = 0; i < 64; i += 16) {
2927  const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i));
2928  Store(enc, d128, a + i);
2929  }
2930  return Load(d, a);
2931 #endif
2932 }
2933 
2935 #if HWY_TARGET == HWY_AVX3_DL
2936  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
2937 #else
2938  alignas(64) uint64_t a[8];
2939  alignas(64) uint64_t b[8];
2940  const Full512<uint64_t> d;
2941  const Full128<uint64_t> d128;
2942  Store(va, d, a);
2943  Store(vb, d, b);
2944  for (size_t i = 0; i < 8; i += 2) {
2945  const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
2946  Store(mul, d128, a + i);
2947  }
2948  return Load(d, a);
2949 #endif
2950 }
2951 
2953 #if HWY_TARGET == HWY_AVX3_DL
2954  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
2955 #else
2956  alignas(64) uint64_t a[8];
2957  alignas(64) uint64_t b[8];
2958  const Full512<uint64_t> d;
2959  const Full128<uint64_t> d128;
2960  Store(va, d, a);
2961  Store(vb, d, b);
2962  for (size_t i = 0; i < 8; i += 2) {
2963  const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
2964  Store(mul, d128, a + i);
2965  }
2966  return Load(d, a);
2967 #endif
2968 }
2969 
2970 #endif // HWY_DISABLE_PCLMUL_AES
2971 
2972 // ================================================== MISC
2973 
2974 // Returns a vector with lane i=[0, N) set to "first" + i.
2975 template <typename T, typename T2>
2976 Vec512<T> Iota(const Full512<T> d, const T2 first) {
2977  HWY_ALIGN T lanes[64 / sizeof(T)];
2978  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
2979  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
2980  }
2981  return Load(d, lanes);
2982 }
2983 
2984 // ------------------------------ Mask testing
2985 
2986 // Beware: the suffix indicates the number of mask bits, not lane size!
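// For example, AllFalse with T = uint8_t below uses _kortestz_mask64_u8: a
// Mask512<uint8_t> has 64 mask bits even though each lane is only 1 byte wide.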
2987 
2988 namespace detail {
2989 
2990 template <typename T>
2991 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
2992 #if HWY_COMPILER_HAS_MASK_INTRINSICS
2993  return _kortestz_mask64_u8(mask.raw, mask.raw);
2994 #else
2995  return mask.raw == 0;
2996 #endif
2997 }
2998 template <typename T>
2999 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3000 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3001  return _kortestz_mask32_u8(mask.raw, mask.raw);
3002 #else
3003  return mask.raw == 0;
3004 #endif
3005 }
3006 template <typename T>
3007 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3008 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3009  return _kortestz_mask16_u8(mask.raw, mask.raw);
3010 #else
3011  return mask.raw == 0;
3012 #endif
3013 }
3014 template <typename T>
3015 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3016 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3017  return _kortestz_mask8_u8(mask.raw, mask.raw);
3018 #else
3019  return mask.raw == 0;
3020 #endif
3021 }
3022 
3023 } // namespace detail
3024 
3025 template <typename T>
3026 HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
3027  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3028 }
3029 
3030 namespace detail {
3031 
3032 template <typename T>
3033 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
3034 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3035  return _kortestc_mask64_u8(mask.raw, mask.raw);
3036 #else
3037  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
3038 #endif
3039 }
3040 template <typename T>
3041 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3042 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3043  return _kortestc_mask32_u8(mask.raw, mask.raw);
3044 #else
3045  return mask.raw == 0xFFFFFFFFull;
3046 #endif
3047 }
3048 template <typename T>
3049 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3050 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3051  return _kortestc_mask16_u8(mask.raw, mask.raw);
3052 #else
3053  return mask.raw == 0xFFFFull;
3054 #endif
3055 }
3056 template <typename T>
3057 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3058 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3059  return _kortestc_mask8_u8(mask.raw, mask.raw);
3060 #else
3061  return mask.raw == 0xFFull;
3062 #endif
3063 }
3064 
3065 } // namespace detail
3066 
3067 template <typename T>
3068 HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3069  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3070 }
3071 
3072 // `p` points to at least 8 readable bytes, not all of which need be valid.
3073 template <typename T>
3075  const uint8_t* HWY_RESTRICT bits) {
3076  Mask512<T> mask;
3077  CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
3078  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3079  return mask;
3080 }
3081 
3082 // `p` points to at least 8 writable bytes.
3083 template <typename T>
3084 HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
3085  uint8_t* bits) {
3086  const size_t kNumBytes = 8 / sizeof(T);
3087  CopyBytes<kNumBytes>(&mask.raw, bits);
3088  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3089  return kNumBytes;
3090 }
3091 
3092 template <typename T>
3093 HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3094  return PopCount(static_cast<uint64_t>(mask.raw));
3095 }
3096 
3097 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3098 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3099  const Mask512<T> mask) {
3100  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
3101 }
3102 
3103 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3104 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3105  const Mask512<T> mask) {
3106  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
3107 }
3108 
3109 // ------------------------------ Compress
3110 
3111 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3113  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
3114 }
3115 
3116 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3117 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
3118  return Vec512<T>{_mm512_maskz_compress_epi64(mask.raw, v.raw)};
3119 }
3120 
3122  return Vec512<float>{_mm512_maskz_compress_ps(mask.raw, v.raw)};
3123 }
3124 
3126  return Vec512<double>{_mm512_maskz_compress_pd(mask.raw, v.raw)};
3127 }
3128 
3129 // 16-bit may use the 32-bit Compress and must be defined after it.
3130 //
3131 // Ignore IDE redefinition error - this is not actually defined in x86_256 if
3132 // we are including x86_512-inl.h.
3133 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3134 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3135  const Full256<T> d;
3136  const Rebind<uint16_t, decltype(d)> du;
3137  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3138 
3139 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3140  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
3141 #else
3142  // Promote to i32 (512-bit vector!) so we can use the native Compress.
3143  const auto vw = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
3144  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
3145  const auto cu = DemoteTo(du, Compress(vw, mask32));
3146 #endif // HWY_TARGET == HWY_AVX3_DL
3147 
3148  return BitCast(d, cu);
3149 }
3150 
3151 // Expands to 32-bit, compresses, then concatenates the demoted halves.
3152 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3153 HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
3154  const Full512<T> d;
3155  const Rebind<uint16_t, decltype(d)> du;
3156  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3157 
3158 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3159  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
3160 #else
3161  const Repartition<int32_t, decltype(d)> dw;
3162  const Half<decltype(du)> duh;
3163  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3164  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3165 
3166  const uint32_t mask_bits{mask.raw};
3167  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
3168  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
3169  const auto compressed0 = Compress(promoted0, mask0);
3170  const auto compressed1 = Compress(promoted1, mask1);
3171 
3172  const auto demoted0 = ZeroExtendVector(DemoteTo(duh, compressed0));
3173  const auto demoted1 = ZeroExtendVector(DemoteTo(duh, compressed1));
3174 
3175  // Concatenate into single vector by shifting upper with writemask.
3176  const size_t num0 = CountTrue(dw, mask0);
3177  const __mmask32 m_upper = ~((1u << num0) - 1);
3178  alignas(64) uint16_t iota[64] = {
3179  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3180  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3181  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3182  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
3183  const auto idx = LoadU(du, iota + 32 - num0);
3184  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
3185  demoted0.raw, m_upper, idx.raw, demoted1.raw)};
3186 #endif // HWY_TARGET == HWY_AVX3_DL
3187 
3188  return BitCast(d, cu);
3189 }
3190 
3191 // ------------------------------ CompressBits
3192 template <typename T>
3194  return Compress(v, LoadMaskBits(Full512<T>(), bits));
3195 }
3196 
3197 // ------------------------------ CompressStore
3198 
3199 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3201  T* HWY_RESTRICT unaligned) {
3202  const Rebind<uint16_t, decltype(d)> du;
3203  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3204 
3205  const uint64_t mask_bits{mask.raw};
3206 
3207 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3208  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
3209 #else
3210  const Repartition<int32_t, decltype(d)> dw;
3211  const Half<decltype(du)> duh;
3212  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3213  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3214 
3215  const uint64_t maskL = mask_bits & 0xFFFF;
3216  const uint64_t maskH = mask_bits >> 16;
3217  const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
3218  const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
3219  const auto compressed0 = Compress(promoted0, mask0);
3220  const auto compressed1 = Compress(promoted1, mask1);
3221 
3222  const Half<decltype(d)> dh;
3223  const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
3224  const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));
3225 
3226  // Store 256-bit halves
3227  StoreU(demoted0, dh, unaligned);
3228  StoreU(demoted1, dh, unaligned + PopCount(maskL));
3229 #endif
3230 
3231  return PopCount(mask_bits);
3232 }
3233 
3234 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3235 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3236  T* HWY_RESTRICT unaligned) {
3237  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3238  return PopCount(uint64_t{mask.raw});
3239 }
3240 
3241 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3242 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3243  T* HWY_RESTRICT unaligned) {
3244  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3245  return PopCount(uint64_t{mask.raw});
3246 }
3247 
3249  Full512<float> /* tag */,
3250  float* HWY_RESTRICT unaligned) {
3251  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
3252  return PopCount(uint64_t{mask.raw});
3253 }
3254 
3256  Full512<double> /* tag */,
3257  double* HWY_RESTRICT unaligned) {
3258  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
3259  return PopCount(uint64_t{mask.raw});
3260 }
3261 
3262 // ------------------------------ CompressBitsStore
3263 template <typename T>
3264 HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
3265  Full512<T> d, T* HWY_RESTRICT unaligned) {
3266  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
3267 }
3268 
3269 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
3270 // TableLookupBytes)
3271 
3273  const Vec512<uint8_t> c, Full512<uint8_t> d,
3274  uint8_t* HWY_RESTRICT unaligned) {
3275  const auto k5 = Set(d, 5);
3276  const auto k6 = Set(d, 6);
3277 
3278  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3279  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3280  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3281  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3282  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3283  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3284  0x80, 0, 0x80, 0x80, 1, 0x80, //
3285  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3286  const auto shuf_r0 = LoadDup128(d, tbl_r0);
3287  const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
3288  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
3289  const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
3290  const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
3291  const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
3292  const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00
3293 
3294  // Second vector: g10,r10, bgr[9:6], b5,g5
3295  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
3296  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
3297  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
3298  const auto r1 = TableLookupBytes(a, shuf_r1);
3299  const auto g1 = TableLookupBytes(b, shuf_g1);
3300  const auto b1 = TableLookupBytes(c, shuf_b1);
3301  const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05
3302 
3303  // Third vector: bgr[15:11], b10
3304  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
3305  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
3306  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
3307  const auto r2 = TableLookupBytes(a, shuf_r2);
3308  const auto g2 = TableLookupBytes(b, shuf_g2);
3309  const auto b2 = TableLookupBytes(c, shuf_b2);
3310  const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A
3311 
3312  // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
3313  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
3314  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
3315  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));
3316 
3317  // Alternating order, most-significant 128 bits from the second arg.
3318  const __mmask8 m = 0xCC;
3319  const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
3320  const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
3321  const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
3322 
3323  StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00
3324  StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15
3325  StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A
3326 }
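// Usage note (not part of the original file): given three planar inputs a, b, c
// (e.g. R, G, B channels), this writes 192 bytes of interleaved triplets
// a0,b0,c0, a1,b1,c1, ... starting at `unaligned`.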
3327 
3328 // ------------------------------ StoreInterleaved4
3329 
3331  const Vec512<uint8_t> v1,
3332  const Vec512<uint8_t> v2,
3333  const Vec512<uint8_t> v3, Full512<uint8_t> d8,
3334  uint8_t* HWY_RESTRICT unaligned) {
3335  const RepartitionToWide<decltype(d8)> d16;
3336  const RepartitionToWide<decltype(d16)> d32;
3337  // let a,b,c,d denote v0..3.
3338  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3339  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3340  const auto ba8 = ZipUpper(d16, v0, v1);
3341  const auto dc8 = ZipUpper(d16, v2, v3);
3342  const auto i = ZipLower(d32, ba0, dc0).raw; // 4x128bit: d..a3 d..a0
3343  const auto j = ZipUpper(d32, ba0, dc0).raw; // 4x128bit: d..a7 d..a4
3344  const auto k = ZipLower(d32, ba8, dc8).raw; // 4x128bit: d..aB d..a8
3345  const auto l = ZipUpper(d32, ba8, dc8).raw; // 4x128bit: d..aF d..aC
3346  // 128-bit blocks were independent until now; transpose 4x4.
3347  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
3348  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
3349  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
3350  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
3351  constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
3352  constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
3353  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
3354  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
3355  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
3356  const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
3357  StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d8, unaligned + 0 * 64);
3358  StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d8, unaligned + 1 * 64);
3359  StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d8, unaligned + 2 * 64);
3360  StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d8, unaligned + 3 * 64);
3361 }
3362 
3363 // ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
3364 
3366  const Vec512<uint64_t> b) {
3367  const DFromV<decltype(a)> du64;
3368  const RepartitionToNarrow<decltype(du64)> du32;
3369  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3370  const auto a32 = BitCast(du32, a);
3371  const auto b32 = BitCast(du32, b);
3372  // Inputs for MulEven: we only need the lower 32 bits
3373  const auto aH = Shuffle2301(a32);
3374  const auto bH = Shuffle2301(b32);
3375 
3376 // Knuth double-word multiplication. We use the 32x32 -> 64-bit MulEven and only need
3377  // the even (lower 64 bits of every 128-bit block) results. See
3378  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
3379  const auto aLbL = MulEven(a32, b32);
3380  const auto w3 = aLbL & maskL;
3381 
3382  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3383  const auto w2 = t2 & maskL;
3384  const auto w1 = ShiftRight<32>(t2);
3385 
3386  const auto t = MulEven(a32, bH) + w2;
3387  const auto k = ShiftRight<32>(t);
3388 
3389  const auto mulH = MulEven(aH, bH) + w1 + k;
3390  const auto mulL = ShiftLeft<32>(t) + w3;
3391  return InterleaveLower(mulL, mulH);
3392 }
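// Illustrative scalar equivalent (not part of the original file; the helper
// name is hypothetical): the same Knuth double-word scheme, multiplying two
// 64-bit values into a 128-bit (hi, lo) pair. The vector code above performs
// this per 128-bit block using the 32x32 -> 64-bit MulEven.
static inline void MulWide64(uint64_t a, uint64_t b, uint64_t* hi,
                             uint64_t* lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;  // 32-bit halves of a
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;  // 32-bit halves of b
  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  *hi = aH * bH + w1 + k;   // upper 64 bits of the product
  *lo = (t << 32) + w3;     // lower 64 bits of the product
}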
3393 
3395  const Vec512<uint64_t> b) {
3396  const DFromV<decltype(a)> du64;
3397  const RepartitionToNarrow<decltype(du64)> du32;
3398  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3399  const auto a32 = BitCast(du32, a);
3400  const auto b32 = BitCast(du32, b);
3401  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3402  const auto aH = Shuffle2301(a32);
3403  const auto bH = Shuffle2301(b32);
3404 
3405  // Same as above, but we're using the odd results (upper 64 bits per block).
3406  const auto aLbL = MulEven(a32, b32);
3407  const auto w3 = aLbL & maskL;
3408 
3409  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3410  const auto w2 = t2 & maskL;
3411  const auto w1 = ShiftRight<32>(t2);
3412 
3413  const auto t = MulEven(a32, bH) + w2;
3414  const auto k = ShiftRight<32>(t);
3415 
3416  const auto mulH = MulEven(aH, bH) + w1 + k;
3417  const auto mulL = ShiftLeft<32>(t) + w3;
3418  return InterleaveUpper(du64, mulL, mulH);
3419 }
3420 
3421 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3422 
3426  const Vec512<float> sum0,
3427  Vec512<float>& sum1) {
3428  // TODO(janwas): _mm512_dpbf16_ps when available
3429  const Repartition<uint16_t, decltype(df32)> du16;
3430  const RebindToUnsigned<decltype(df32)> du32;
3431  const Vec512<uint16_t> zero = Zero(du16);
3432  // Lane order within sum0/1 is undefined, hence we can avoid the
3433  // longer-latency lane-crossing PromoteTo.
3434  const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3435  const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3436  const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3437  const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3438  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3439  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3440 }
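// Usage sketch (not part of the original file): accumulate a bf16 dot product
// into sum0/sum1 across loop iterations; because the lane order within the
// sums is unspecified, only reduce at the end, e.g. via
// GetLane(SumOfLanes(df32, sum0 + sum1)).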
3441 
3442 // ------------------------------ Reductions
3443 
3444 // Returns the sum in each lane.
3446  return Set(d, _mm512_reduce_add_epi32(v.raw));
3447 }
3449  return Set(d, _mm512_reduce_add_epi64(v.raw));
3450 }
3452  return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
3453 }
3455  return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
3456 }
3458  return Set(d, _mm512_reduce_add_ps(v.raw));
3459 }
3461  return Set(d, _mm512_reduce_add_pd(v.raw));
3462 }
3463 
3464 // Returns the minimum in each lane.
3466  return Set(d, _mm512_reduce_min_epi32(v.raw));
3467 }
3469  return Set(d, _mm512_reduce_min_epi64(v.raw));
3470 }
3472  return Set(d, _mm512_reduce_min_epu32(v.raw));
3473 }
3475  return Set(d, _mm512_reduce_min_epu64(v.raw));
3476 }
3478  return Set(d, _mm512_reduce_min_ps(v.raw));
3479 }
3481  return Set(d, _mm512_reduce_min_pd(v.raw));
3482 }
3483 
3484 // Returns the maximum in each lane.
3486  return Set(d, _mm512_reduce_max_epi32(v.raw));
3487 }
3489  return Set(d, _mm512_reduce_max_epi64(v.raw));
3490 }
3492  return Set(d, _mm512_reduce_max_epu32(v.raw));
3493 }
3495  return Set(d, _mm512_reduce_max_epu64(v.raw));
3496 }
3498  return Set(d, _mm512_reduce_max_ps(v.raw));
3499 }
3501  return Set(d, _mm512_reduce_max_pd(v.raw));
3502 }
3503 
3504 // ================================================== DEPRECATED
3505 
3506 template <typename T>
3507 HWY_API size_t StoreMaskBits(const Mask512<T> mask, uint8_t* bits) {
3508  return StoreMaskBits(Full512<T>(), mask, bits);
3509 }
3510 
3511 template <typename T>
3512 HWY_API bool AllTrue(const Mask512<T> mask) {
3513  return AllTrue(Full512<T>(), mask);
3514 }
3515 
3516 template <typename T>
3517 HWY_API bool AllFalse(const Mask512<T> mask) {
3518  return AllFalse(Full512<T>(), mask);
3519 }
3520 
3521 template <typename T>
3522 HWY_API size_t CountTrue(const Mask512<T> mask) {
3523  return CountTrue(Full512<T>(), mask);
3524 }
3525 
3526 template <typename T>
3528  return SumOfLanes(Full512<T>(), v);
3529 }
3530 
3531 template <typename T>
3533  return MinOfLanes(Full512<T>(), v);
3534 }
3535 
3536 template <typename T>
3538  return MaxOfLanes(Full512<T>(), v);
3539 }
3540 
3541 template <typename T>
3543  return UpperHalf(Full256<T>(), v);
3544 }
3545 
3546 template <int kBytes, typename T>
3548  return ShiftRightBytes<kBytes>(Full512<T>(), v);
3549 }
3550 
3551 template <int kLanes, typename T>
3553  return ShiftRightBytes<kLanes>(Full512<T>(), v);
3554 }
3555 
3556 template <size_t kBytes, typename T>
3558  return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo);
3559 }
3560 
3561 template <typename T>
3563  return InterleaveUpper(Full512<T>(), a, b);
3564 }
3565 
3566 template <typename T>
3568  return InterleaveUpper(Full512<MakeWide<T>>(), a, b);
3569 }
3570 
3571 template <typename T>
3573  return Combine(Full512<T>(), hi, lo);
3574 }
3575 
3576 template <typename T>
3578  return ZeroExtendVector(Full512<T>(), lo);
3579 }
3580 
3581 template <typename T>
3583  return ConcatLowerLower(Full512<T>(), hi, lo);
3584 }
3585 
3586 template <typename T>
3588  return ConcatLowerUpper(Full512<T>(), hi, lo);
3589 }
3590 
3591 template <typename T>
3593  return ConcatUpperLower(Full512<T>(), hi, lo);
3594 }
3595 
3596 template <typename T>
3598  return ConcatUpperUpper(Full512<T>(), hi, lo);
3599 }
3600 
3601 // NOLINTNEXTLINE(google-readability-namespace-comments)
3602 } // namespace HWY_NAMESPACE
3603 } // namespace hwy
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:66
#define HWY_API
Definition: base.h:117
#define HWY_INLINE
Definition: base.h:59
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:67
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: arm_neon-inl.h:468
Raw raw
Definition: arm_neon-inl.h:501
Definition: x86_256-inl.h:67
Raw raw
Definition: x86_256-inl.h:95
Definition: x86_512-inl.h:101
typename detail::Raw512< T >::type Raw
Definition: x86_512-inl.h:102
HWY_INLINE Vec512 & operator*=(const Vec512 other)
Definition: x86_512-inl.h:107
Raw raw
Definition: x86_512-inl.h:129
HWY_INLINE Vec512 & operator+=(const Vec512 other)
Definition: x86_512-inl.h:113
HWY_INLINE Vec512 & operator&=(const Vec512 other)
Definition: x86_512-inl.h:119
HWY_INLINE Vec512 & operator|=(const Vec512 other)
Definition: x86_512-inl.h:122
HWY_INLINE Vec512 & operator-=(const Vec512 other)
Definition: x86_512-inl.h:116
HWY_INLINE Vec512 & operator^=(const Vec512 other)
Definition: x86_512-inl.h:125
HWY_INLINE Vec512 & operator/=(const Vec512 other)
Definition: x86_512-inl.h:110
const double shift
Definition: RateControl.cpp:165
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:1463
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:842
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:2739
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:2798
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1278
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1136
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:127
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:3589
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:879
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:672
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:768
HWY_INLINE auto FixConversionOverflow(Simd< TI, N > di, decltype(Zero(DF())) original, decltype(Zero(di).raw) converted_raw) -> decltype(Zero(di))
Definition: x86_128-inl.h:4176
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1487
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2184
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition: x86_128-inl.h:4165
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:551
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3490
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:2728
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition: x86_128-inl.h:714
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2176
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:805
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:624
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3064
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3435
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1073
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3629
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2949
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3052
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3760
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3495
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:904
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3947
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3726
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3389
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1795
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3070
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3802
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3483
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1438
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1443
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: shared-inl.h:147
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:778
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1448
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
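ConvertTo and NearestInt give the usual int-to-float round trip. A sketch under the same scaffolding and alignment assumptions as above (the name SquareToInt is illustrative):

void SquareToInt(const int32_t* HWY_RESTRICT in, int32_t* HWY_RESTRICT out) {
  const HWY_FULL(int32_t) di;
  const HWY_FULL(float) df;                      // same lane count as di
  const auto vf = ConvertTo(df, Load(di, in));   // int32 -> float
  Store(NearestInt(vf * vf), di, out);           // round back to nearest int32
}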
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:392
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3395
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
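The mask queries CountTrue, FindFirstTrue, AllTrue and AllFalse inspect a comparison result without leaving SIMD registers. A sketch (the name CountNegative is illustrative; the vector operator< used here is the counterpart of the operator> listed above):

size_t CountNegative(const float* HWY_RESTRICT p, intptr_t* first_idx) {
  const HWY_FULL(float) d;
  const auto negative = Load(d, p) < Zero(d);   // per-lane comparison mask
  *first_idx = FindFirstTrue(d, negative);      // index of first true lane, -1 if none
  // AllTrue(d, negative) / AllFalse(d, negative) answer the all/none questions.
  return CountTrue(d, negative);
}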
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
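StoreInterleaved3 writes three registers as interleaved triples, e.g. planar-to-packed RGB. A sketch using the 128-bit overload shown above (the name InterleaveRGB is illustrative; it assumes this overload is available for the compiled target):

void InterleaveRGB(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
                   const uint8_t* HWY_RESTRICT b, uint8_t* HWY_RESTRICT rgb) {
  const Full128<uint8_t> d;               // the overload listed above is 128-bit
  const auto vr = LoadU(d, r);
  const auto vg = LoadU(d, g);
  const auto vb = LoadU(d, b);
  StoreInterleaved3(vr, vg, vb, d, rgb);  // writes R0 G0 B0 R1 G1 B1 ...
}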
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API VFromD< DW > ZipUpper(DW dw, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3538
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1735
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: shared-inl.h:160
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:794
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3506
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3041
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec128< uint8_t, 8 > hi, Vec128< uint8_t, 8 > lo)
Definition: arm_neon-inl.h:3566
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1194
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
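GatherIndex reads lanes from arbitrary element offsets within an array. A sketch for 32-bit lanes (the name GatherFloats is illustrative; it assumes a target with gather support, such as AVX-512, and in-bounds indices):

void GatherFloats(const float* HWY_RESTRICT base,
                  const int32_t* HWY_RESTRICT indices,
                  float* HWY_RESTRICT out) {
  const HWY_FULL(float) d;
  const HWY_FULL(int32_t) di;           // same lane count as d
  const auto idx = Load(di, indices);   // element indices into base[]
  Store(GatherIndex(d, base, idx), d, out);
}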
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
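FirstN and MaskedLoad handle array lengths that are not a multiple of the vector size. A sketch (the name SumArray is illustrative; p is assumed vector-aligned, and on targets without native masked loads the tail load may still touch a full vector, so padded allocation is the conservative assumption):

float SumArray(const float* HWY_RESTRICT p, size_t count) {
  const HWY_FULL(float) d;
  const size_t N = Lanes(d);
  auto sum = Zero(d);
  size_t i = 0;
  for (; i + N <= count; i += N) sum += Load(d, p + i);
  if (i < count) {
    const auto m = FirstN(d, count - i);  // true for the remaining lanes only
    sum += MaskedLoad(m, d, p + i);       // masked-off lanes are zero
  }
  return GetLane(SumOfLanes(d, sum));
}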
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3637
typename D::template Rebind< T > Rebind
Definition: shared-inl.h:144
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1799
HWY_API V InterleaveUpper(Simd< T, N >, V a, V b)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1084
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< uint32_t, 2 > Shuffle2301(const Vec128< uint32_t, 2 > v)
Definition: arm_neon-inl.h:1698
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1077
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:3844
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
HWY_API Vec128< int64_t, 1 > Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3752
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4787
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
typename D::Half Half
Definition: shared-inl.h:164
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
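SumOfLanes, MinOfLanes and MaxOfLanes broadcast the reduction result to every lane, so GetLane extracts the scalar value. A sketch under the same scaffolding as above (the name Reductions is illustrative):

void Reductions(const float* HWY_RESTRICT p, float* HWY_RESTRICT out3) {
  const HWY_FULL(float) d;
  const auto v = Load(d, p);
  out3[0] = GetLane(SumOfLanes(d, v));  // sum of all lanes
  out3[1] = GetLane(MinOfLanes(d, v));  // minimum lane
  out3[2] = GetLane(MaxOfLanes(d, v));  // maximum lane
}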
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:2665
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
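CompressStore packs the lanes selected by a mask to the front and writes them, returning how many were written; this is the building block for stream compaction. A sketch (the name KeepNonNegative is illustrative):

size_t KeepNonNegative(const float* HWY_RESTRICT src, float* HWY_RESTRICT dst) {
  const HWY_FULL(float) d;
  const auto v = Load(d, src);
  const auto keep = v >= Zero(d);          // mask of lanes to retain
  return CompressStore(v, keep, d, dst);   // packs them into dst, returns the count
}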
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3401
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< uint8_t, 8 > UpperHalf(Simd< uint8_t, 8 >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3096
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3407
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:984
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3385
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:405
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
Definition: aligned_allocator.h:23
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:634
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:565
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:589
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:555
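The scalar helpers PopCount and Num0BitsBelowLS1Bit_Nonzero64 from base.h are often used to walk the set bits of a mask produced by StoreMaskBits. A self-contained sketch (the name ForEachSetBit is illustrative):

#include <stdint.h>

#include "hwy/base.h"

// Invokes func(bit_index) for every set bit in `bits`, lowest bit first.
template <class Func>
void ForEachSetBit(uint64_t bits, const Func& func) {
  const size_t total = hwy::PopCount(bits);             // number of bits to visit
  for (size_t i = 0; i < total; ++i) {
    func(hwy::Num0BitsBelowLS1Bit_Nonzero64(bits));     // trailing-zero count = bit index
    bits &= bits - 1;                                   // clear the lowest set bit
  }
}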
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:529
#define HWY_ALIGN
Definition: set_macros-inl.h:78
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
hwy::HWY_NAMESPACE::Indices512
Definition: x86_512-inl.h:2301
__m512i raw
Definition: x86_512-inl.h:2302
hwy::HWY_NAMESPACE::Mask512
Definition: x86_512-inl.h:134
detail::RawMask512< sizeof(T)>::type raw
Definition: x86_512-inl.h:135
hwy::HWY_NAMESPACE::Simd
Definition: shared-inl.h:35
HWY_INLINE __m512d operator()(__m512i v)
Definition: x86_512-inl.h:164
HWY_INLINE __m512 operator()(__m512i v)
Definition: x86_512-inl.h:160
HWY_INLINE __m512i operator()(__m512i v)
Definition: x86_512-inl.h:156
__m512d type
Definition: x86_512-inl.h:75
__m512 type
Definition: x86_512-inl.h:71
hwy::HWY_NAMESPACE::detail::Raw512
Definition: x86_512-inl.h:66
__m512i type
Definition: x86_512-inl.h:67
__mmask64 type
Definition: x86_512-inl.h:83
__mmask32 type
Definition: x86_512-inl.h:87
__mmask16 type
Definition: x86_512-inl.h:91
__mmask8 type
Definition: x86_512-inl.h:95
hwy::HWY_NAMESPACE::detail::RawMask512
Definition: x86_512-inl.h:80
Definition: base.h:290
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()