// scalar-inl.h — single-lane ("scalar") implementation of the Highway ops.
// (Recovered from generated documentation; doxygen navigation chrome removed.)
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.
#include <stddef.h>
#include <stdint.h>

#include <cmath>    // std::abs, std::sqrt, std::isnan, std::isinf, std::signbit
#include <cstdlib>  // std::abs(integral)

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
23 
25 namespace hwy {
26 namespace HWY_NAMESPACE {
27 
28 // Single instruction, single data.
29 template <typename T>
30 using Sisd = Simd<T, 1>;
31 
32 // (Wrapper class required for overloading comparison operators.)
33 template <typename T>
34 struct Vec1 {
35  HWY_INLINE Vec1() = default;
36  Vec1(const Vec1&) = default;
37  Vec1& operator=(const Vec1&) = default;
38  HWY_INLINE explicit Vec1(const T t) : raw(t) {}
39 
40  HWY_INLINE Vec1& operator*=(const Vec1 other) {
41  return *this = (*this * other);
42  }
43  HWY_INLINE Vec1& operator/=(const Vec1 other) {
44  return *this = (*this / other);
45  }
46  HWY_INLINE Vec1& operator+=(const Vec1 other) {
47  return *this = (*this + other);
48  }
49  HWY_INLINE Vec1& operator-=(const Vec1 other) {
50  return *this = (*this - other);
51  }
52  HWY_INLINE Vec1& operator&=(const Vec1 other) {
53  return *this = (*this & other);
54  }
55  HWY_INLINE Vec1& operator|=(const Vec1 other) {
56  return *this = (*this | other);
57  }
58  HWY_INLINE Vec1& operator^=(const Vec1 other) {
59  return *this = (*this ^ other);
60  }
61 
62  T raw;
63 };
64 
65 // 0 or FF..FF, same size as Vec1.
66 template <typename T>
67 class Mask1 {
69 
70  public:
71  static HWY_INLINE Mask1<T> FromBool(bool b) {
72  Mask1<T> mask;
73  mask.bits = b ? ~Raw(0) : 0;
74  return mask;
75  }
76 
78 };
79 
80 namespace detail {
81 
82 // Deduce Sisd<T> from Vec1<T>
83 struct Deduce1 {
84  template <typename T>
86  return Sisd<T>();
87  }
88 };
89 
90 } // namespace detail
91 
92 template <class V>
93 using DFromV = decltype(detail::Deduce1()(V()));
94 
95 template <class V>
96 using TFromV = TFromD<DFromV<V>>;
97 
98 // ------------------------------ BitCast
99 
100 template <typename T, typename FromT>
102  static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
103  T to;
104  CopyBytes<sizeof(FromT)>(&v.raw, &to);
105  return Vec1<T>(to);
106 }
107 
108 // ------------------------------ Set
109 
110 template <typename T>
112  return Vec1<T>(T(0));
113 }
114 
115 template <typename T, typename T2>
116 HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
117  return Vec1<T>(static_cast<T>(t));
118 }
119 
120 template <typename T>
122  return Zero(d);
123 }
124 
125 template <typename T, typename T2>
126 HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
127  return Vec1<T>(static_cast<T>(first));
128 }
129 
130 // ================================================== LOGICAL
131 
132 // ------------------------------ Not
133 
134 template <typename T>
136  using TU = MakeUnsigned<T>;
137  const Sisd<TU> du;
138  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
139 }
140 
141 // ------------------------------ And
142 
143 template <typename T>
144 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
145  using TU = MakeUnsigned<T>;
146  const Sisd<TU> du;
147  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
148 }
149 template <typename T>
151  return And(a, b);
152 }
153 
154 // ------------------------------ AndNot
155 
156 template <typename T>
157 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
158  using TU = MakeUnsigned<T>;
159  const Sisd<TU> du;
160  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
161  BitCast(du, b).raw)));
162 }
163 
164 // ------------------------------ Or
165 
166 template <typename T>
167 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
168  using TU = MakeUnsigned<T>;
169  const Sisd<TU> du;
170  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
171 }
172 template <typename T>
174  return Or(a, b);
175 }
176 
177 // ------------------------------ Xor
178 
179 template <typename T>
180 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
181  using TU = MakeUnsigned<T>;
182  const Sisd<TU> du;
183  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
184 }
185 template <typename T>
187  return Xor(a, b);
188 }
189 
190 // ------------------------------ CopySign
191 
192 template <typename T>
193 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
194  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
195  const auto msb = SignBit(Sisd<T>());
196  return Or(AndNot(msb, magn), And(msb, sign));
197 }
198 
199 template <typename T>
200 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
201  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
202  return Or(abs, And(SignBit(Sisd<T>()), sign));
203 }
204 
205 // ------------------------------ BroadcastSignBit
206 
207 template <typename T>
209  // This is used inside ShiftRight, so we cannot implement in terms of it.
210  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
211 }
212 
213 // ------------------------------ PopulationCount
214 
215 #ifdef HWY_NATIVE_POPCNT
216 #undef HWY_NATIVE_POPCNT
217 #else
218 #define HWY_NATIVE_POPCNT
219 #endif
220 
221 template <typename T>
223  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
224 }
225 
226 // ------------------------------ Mask
227 
228 template <typename TFrom, typename TTo>
230  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
231  return Mask1<TTo>{m.bits};
232 }
233 
234 // v must be 0 or FF..FF.
235 template <typename T>
237  Mask1<T> mask;
238  CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
239  return mask;
240 }
241 
242 template <typename T>
244  Vec1<T> v;
245  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
246  return v;
247 }
248 
249 template <typename T>
250 Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
251  Vec1<T> v;
252  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
253  return v;
254 }
255 
256 template <typename T>
257 HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
258  return Mask1<T>::FromBool(n != 0);
259 }
260 
261 // Returns mask ? yes : no.
262 template <typename T>
263 HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
264  const Vec1<T> no) {
265  return mask.bits ? yes : no;
266 }
267 
268 template <typename T>
270  return mask.bits ? yes : Vec1<T>(0);
271 }
272 
273 template <typename T>
275  return mask.bits ? Vec1<T>(0) : no;
276 }
277 
278 template <typename T>
280  return v.raw < 0 ? Vec1<T>(0) : v;
281 }
282 
283 // ------------------------------ Mask logical
284 
285 template <typename T>
287  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
288 }
289 
290 template <typename T>
292  const Sisd<T> d;
293  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
294 }
295 
296 template <typename T>
298  const Sisd<T> d;
299  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
300 }
301 
302 template <typename T>
304  const Sisd<T> d;
305  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
306 }
307 
308 template <typename T>
310  const Sisd<T> d;
311  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
312 }
313 
314 // ================================================== SHIFTS
315 
316 // ------------------------------ ShiftLeft (BroadcastSignBit)
317 
318 template <int kBits, typename T>
320  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
321  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
322 }
323 
324 template <int kBits, typename T>
326  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
327 #if __cplusplus >= 202002L
328  // Signed right shift is now guaranteed to be arithmetic (rounding toward
329  // negative infinity, i.e. shifting in the sign bit).
330  return Vec1<T>(v.raw >> kBits);
331 #else
332  if (IsSigned<T>()) {
333  // Emulate arithmetic shift using only logical (unsigned) shifts, because
334  // signed shifts are still implementation-defined.
335  using TU = hwy::MakeUnsigned<T>;
336  const Sisd<TU> du;
337  const TU shifted = BitCast(du, v).raw >> kBits;
338  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
339  const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
340  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
341  } else {
342  return Vec1<T>(v.raw >> kBits); // unsigned, logical shift
343  }
344 #endif
345 }
346 
347 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
348 
349 template <typename T>
350 HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
351  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
352 }
353 
354 template <typename T>
355 HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
356 #if __cplusplus >= 202002L
357  // Signed right shift is now guaranteed to be arithmetic (rounding toward
358  // negative infinity, i.e. shifting in the sign bit).
359  return Vec1<T>(v.raw >> bits);
360 #else
361  if (IsSigned<T>()) {
362  // Emulate arithmetic shift using only logical (unsigned) shifts, because
363  // signed shifts are still implementation-defined.
364  using TU = hwy::MakeUnsigned<T>;
365  const Sisd<TU> du;
366  const TU shifted = BitCast(du, v).raw >> bits;
367  const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
368  const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
369  return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
370  } else {
371  return Vec1<T>(v.raw >> bits); // unsigned, logical shift
372  }
373 #endif
374 }
375 
376 // ------------------------------ Shl
377 
378 // Single-lane => same as ShiftLeftSame except for the argument type.
379 template <typename T>
380 HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
381  return ShiftLeftSame(v, static_cast<int>(bits.raw));
382 }
383 
384 template <typename T>
385 HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
386  return ShiftRightSame(v, static_cast<int>(bits.raw));
387 }
388 
389 // ================================================== ARITHMETIC
390 
391 template <typename T>
393  const uint64_t a64 = static_cast<uint64_t>(a.raw);
394  const uint64_t b64 = static_cast<uint64_t>(b.raw);
395  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
396 }
398  return Vec1<float>(a.raw + b.raw);
399 }
401  return Vec1<double>(a.raw + b.raw);
402 }
403 
404 template <typename T>
406  const uint64_t a64 = static_cast<uint64_t>(a.raw);
407  const uint64_t b64 = static_cast<uint64_t>(b.raw);
408  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
409 }
411  return Vec1<float>(a.raw - b.raw);
412 }
414  return Vec1<double>(a.raw - b.raw);
415 }
416 
417 // ------------------------------ Saturating addition
418 
419 // Returns a + b clamped to the destination range.
420 
421 // Unsigned
423  const Vec1<uint8_t> b) {
424  return Vec1<uint8_t>(
425  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
426 }
428  const Vec1<uint16_t> b) {
429  return Vec1<uint16_t>(
430  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
431 }
432 
433 // Signed
435  return Vec1<int8_t>(
436  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
437 }
439  const Vec1<int16_t> b) {
440  return Vec1<int16_t>(
441  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
442 }
443 
444 // ------------------------------ Saturating subtraction
445 
446 // Returns a - b clamped to the destination range.
447 
448 // Unsigned
450  const Vec1<uint8_t> b) {
451  return Vec1<uint8_t>(
452  static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
453 }
455  const Vec1<uint16_t> b) {
456  return Vec1<uint16_t>(
457  static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
458 }
459 
460 // Signed
462  return Vec1<int8_t>(
463  static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
464 }
466  const Vec1<int16_t> b) {
467  return Vec1<int16_t>(
468  static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
469 }
470 
471 // ------------------------------ Average
472 
473 // Returns (a + b + 1) / 2
474 
476  const Vec1<uint8_t> b) {
477  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
478 }
480  const Vec1<uint16_t> b) {
481  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
482 }
483 
484 // ------------------------------ Absolute value
485 
486 template <typename T>
488  const T i = a.raw;
489  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
490 }
492  return Vec1<float>(std::abs(a.raw));
493 }
495  return Vec1<double>(std::abs(a.raw));
496 }
497 
498 // ------------------------------ min/max
499 
500 template <typename T, HWY_IF_NOT_FLOAT(T)>
501 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
502  return Vec1<T>(HWY_MIN(a.raw, b.raw));
503 }
504 
505 template <typename T, HWY_IF_FLOAT(T)>
506 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
507  if (std::isnan(a.raw)) return b;
508  if (std::isnan(b.raw)) return a;
509  return Vec1<T>(HWY_MIN(a.raw, b.raw));
510 }
511 
512 template <typename T, HWY_IF_NOT_FLOAT(T)>
513 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
514  return Vec1<T>(HWY_MAX(a.raw, b.raw));
515 }
516 
517 template <typename T, HWY_IF_FLOAT(T)>
518 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
519  if (std::isnan(a.raw)) return b;
520  if (std::isnan(b.raw)) return a;
521  return Vec1<T>(HWY_MAX(a.raw, b.raw));
522 }
523 
524 // ------------------------------ Floating-point negate
525 
526 template <typename T, HWY_IF_FLOAT(T)>
528  return Xor(v, SignBit(Sisd<T>()));
529 }
530 
531 template <typename T, HWY_IF_NOT_FLOAT(T)>
532 HWY_API Vec1<T> Neg(const Vec1<T> v) {
533  return Zero(Sisd<T>()) - v;
534 }
535 
536 // ------------------------------ mul/div
537 
538 template <typename T, HWY_IF_FLOAT(T)>
540  return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
541 }
542 
543 template <typename T, HWY_IF_SIGNED(T)>
544 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
545  return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
546 }
547 
548 template <typename T, HWY_IF_UNSIGNED(T)>
549 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
550  return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
551 }
552 
553 template <typename T>
555  return Vec1<T>(a.raw / b.raw);
556 }
557 
558 // Returns the upper 16 bits of a * b in each lane.
560  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
561 }
563  // Cast to uint32_t first to prevent overflow. Otherwise the result of
564  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
565  // is the same but this way it is also defined.
566  return Vec1<uint16_t>(static_cast<uint16_t>(
567  (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
568 }
569 
570 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
572  const int64_t a64 = a.raw;
573  return Vec1<int64_t>(a64 * b.raw);
574 }
576  const uint64_t a64 = a.raw;
577  return Vec1<uint64_t>(a64 * b.raw);
578 }
579 
580 // Approximate reciprocal
582  // Zero inputs are allowed, but callers are responsible for replacing the
583  // return value with something else (typically using IfThenElse). This check
584  // avoids a ubsan error. The return value is arbitrary.
585  if (v.raw == 0.0f) return Vec1<float>(0.0f);
586  return Vec1<float>(1.0f / v.raw);
587 }
588 
589 // Absolute value of difference.
591  return Abs(a - b);
592 }
593 
594 // ------------------------------ Floating-point multiply-add variants
595 
596 template <typename T>
597 HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
598  return mul * x + add;
599 }
600 
601 template <typename T>
603  const Vec1<T> add) {
604  return add - mul * x;
605 }
606 
607 template <typename T>
608 HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
609  return mul * x - sub;
610 }
611 
612 template <typename T>
614  const Vec1<T> sub) {
615  return Neg(mul) * x - sub;
616 }
617 
618 // ------------------------------ Floating-point square root
619 
620 // Approximate reciprocal square root
622  float f = v.raw;
623  const float half = f * 0.5f;
624  uint32_t bits;
625  CopyBytes<4>(&f, &bits);
626  // Initial guess based on log2(f)
627  bits = 0x5F3759DF - (bits >> 1);
628  CopyBytes<4>(&bits, &f);
629  // One Newton-Raphson iteration
630  return Vec1<float>(f * (1.5f - (half * f * f)));
631 }
632 
633 // Square root
635  return Vec1<float>(std::sqrt(v.raw));
636 }
638  return Vec1<double>(std::sqrt(v.raw));
639 }
640 
641 // ------------------------------ Floating-point rounding
642 
643 template <typename T>
645  using TI = MakeSigned<T>;
646  if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
647  return v;
648  }
649  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
650  const TI rounded = static_cast<TI>(v.raw + bias);
651  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
652  // Round to even
653  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
654  return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
655  }
656  return Vec1<T>(static_cast<T>(rounded));
657 }
658 
659 // Round-to-nearest even.
661  using T = float;
662  using TI = int32_t;
663 
664  const T abs = Abs(v).raw;
665  const bool signbit = std::signbit(v.raw);
666 
667  if (!(abs < MantissaEnd<T>())) { // Huge or NaN
668  // Check if too large to cast or NaN
669  if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
670  return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
671  }
672  return Vec1<int32_t>(static_cast<TI>(v.raw));
673  }
674  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
675  const TI rounded = static_cast<TI>(v.raw + bias);
676  if (rounded == 0) return Vec1<int32_t>(0);
677  // Round to even
678  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
679  return Vec1<TI>(rounded - (signbit ? -1 : 1));
680  }
681  return Vec1<TI>(rounded);
682 }
683 
684 template <typename T>
686  using TI = MakeSigned<T>;
687  if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
688  return v;
689  }
690  const TI truncated = static_cast<TI>(v.raw);
691  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
692  return Vec1<T>(static_cast<T>(truncated));
693 }
694 
695 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
696  class V>
697 V Ceiling(const V v) {
698  const Bits kExponentMask = (1ull << kExponentBits) - 1;
699  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
700  const Bits kBias = kExponentMask / 2;
701 
702  Float f = v.raw;
703  const bool positive = f > Float(0.0);
704 
705  Bits bits;
706  CopyBytes<sizeof(Bits)>(&v, &bits);
707 
708  const int exponent =
709  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
710  // Already an integer.
711  if (exponent >= kMantissaBits) return v;
712  // |v| <= 1 => 0 or 1.
713  if (exponent < 0) return positive ? V(1) : V(-0.0);
714 
715  const Bits mantissa_mask = kMantissaMask >> exponent;
716  // Already an integer
717  if ((bits & mantissa_mask) == 0) return v;
718 
719  // Clear fractional bits and round up
720  if (positive) bits += (kMantissaMask + 1) >> exponent;
721  bits &= ~mantissa_mask;
722 
723  CopyBytes<sizeof(Bits)>(&bits, &f);
724  return V(f);
725 }
726 
727 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
728  class V>
729 V Floor(const V v) {
730  const Bits kExponentMask = (1ull << kExponentBits) - 1;
731  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
732  const Bits kBias = kExponentMask / 2;
733 
734  Float f = v.raw;
735  const bool negative = f < Float(0.0);
736 
737  Bits bits;
738  CopyBytes<sizeof(Bits)>(&v, &bits);
739 
740  const int exponent =
741  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
742  // Already an integer.
743  if (exponent >= kMantissaBits) return v;
744  // |v| <= 1 => -1 or 0.
745  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
746 
747  const Bits mantissa_mask = kMantissaMask >> exponent;
748  // Already an integer
749  if ((bits & mantissa_mask) == 0) return v;
750 
751  // Clear fractional bits and round down
752  if (negative) bits += (kMantissaMask + 1) >> exponent;
753  bits &= ~mantissa_mask;
754 
755  CopyBytes<sizeof(Bits)>(&bits, &f);
756  return V(f);
757 }
758 
759 // Toward +infinity, aka ceiling
761  return Ceiling<float, uint32_t, 23, 8>(v);
762 }
764  return Ceiling<double, uint64_t, 52, 11>(v);
765 }
766 
767 // Toward -infinity, aka floor
769  return Floor<float, uint32_t, 23, 8>(v);
770 }
772  return Floor<double, uint64_t, 52, 11>(v);
773 }
774 
775 // ================================================== COMPARE
776 
777 template <typename T>
779  return Mask1<T>::FromBool(a.raw == b.raw);
780 }
781 
782 template <typename T>
784  return Mask1<T>::FromBool(a.raw != b.raw);
785 }
786 
787 template <typename T>
788 HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
789  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
790  return (v & bit) == bit;
791 }
792 
793 template <typename T>
795  return Mask1<T>::FromBool(a.raw < b.raw);
796 }
797 template <typename T>
799  return Mask1<T>::FromBool(a.raw > b.raw);
800 }
801 
802 template <typename T>
804  return Mask1<T>::FromBool(a.raw <= b.raw);
805 }
806 template <typename T>
808  return Mask1<T>::FromBool(a.raw >= b.raw);
809 }
810 
811 // ================================================== MEMORY
812 
813 // ------------------------------ Load
814 
815 template <typename T>
816 HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
817  T t;
818  CopyBytes<sizeof(T)>(aligned, &t);
819  return Vec1<T>(t);
820 }
821 
822 template <typename T>
824  const T* HWY_RESTRICT aligned) {
825  return IfThenElseZero(m, Load(d, aligned));
826 }
827 
828 template <typename T>
830  return Load(d, p);
831 }
832 
833 // In some use cases, "load single lane" is sufficient; otherwise avoid this.
834 template <typename T>
836  return Load(d, aligned);
837 }
838 
839 // ------------------------------ Store
840 
841 template <typename T>
842 HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
843  T* HWY_RESTRICT aligned) {
844  CopyBytes<sizeof(T)>(&v.raw, aligned);
845 }
846 
847 template <typename T>
848 HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
849  return Store(v, d, p);
850 }
851 
852 // ------------------------------ StoreInterleaved3
853 
855  const Vec1<uint8_t> v2, Sisd<uint8_t> d,
856  uint8_t* HWY_RESTRICT unaligned) {
857  StoreU(v0, d, unaligned + 0);
858  StoreU(v1, d, unaligned + 1);
859  StoreU(v2, d, unaligned + 2);
860 }
861 
863  const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
864  Sisd<uint8_t> d,
865  uint8_t* HWY_RESTRICT unaligned) {
866  StoreU(v0, d, unaligned + 0);
867  StoreU(v1, d, unaligned + 1);
868  StoreU(v2, d, unaligned + 2);
869  StoreU(v3, d, unaligned + 3);
870 }
871 
872 // ------------------------------ Stream
873 
874 template <typename T>
875 HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
876  return Store(v, d, aligned);
877 }
878 
879 // ------------------------------ Scatter
880 
881 template <typename T, typename Offset>
883  const Vec1<Offset> offset) {
884  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
885  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
886  return Store(v, d, reinterpret_cast<T*>(base8));
887 }
888 
889 template <typename T, typename Index>
891  const Vec1<Index> index) {
892  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
893  return Store(v, d, base + index.raw);
894 }
895 
896 // ------------------------------ Gather
897 
898 template <typename T, typename Offset>
900  const Vec1<Offset> offset) {
901  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
902  const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
903  return Load(d, reinterpret_cast<const T*>(addr));
904 }
905 
906 template <typename T, typename Index>
908  const Vec1<Index> index) {
909  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
910  return Load(d, base + index.raw);
911 }
912 
913 // ================================================== CONVERT
914 
915 // ConvertTo and DemoteTo with floating-point input and integer output truncate
916 // (rounding toward zero).
917 
918 template <typename FromT, typename ToT>
920  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
921  // For bits Y > X, floatX->floatY and intX->intY are always representable.
922  return Vec1<ToT>(static_cast<ToT>(from.raw));
923 }
924 
925 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
927  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
928 
929  // Prevent ubsan errors when converting float to narrower integer/float
930  if (std::isinf(from.raw) ||
931  std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
932  return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
933  : HighestValue<ToT>());
934  }
935  return Vec1<ToT>(static_cast<ToT>(from.raw));
936 }
937 
938 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
939 HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
940  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
941 
942  // Int to int: choose closest value in ToT to `from` (avoids UB)
943  from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
944  return Vec1<ToT>(static_cast<ToT>(from.raw));
945 }
946 
948 #if HWY_NATIVE_FLOAT16
949  uint16_t bits16;
950  CopyBytes<2>(&v.raw, &bits16);
951 #else
952  const uint16_t bits16 = v.raw.bits;
953 #endif
954  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
955  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
956  const uint32_t mantissa = bits16 & 0x3FF;
957 
958  // Subnormal or zero
959  if (biased_exp == 0) {
960  const float subnormal =
961  (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
962  return Vec1<float>(sign ? -subnormal : subnormal);
963  }
964 
965  // Normalized: convert the representation directly (faster than ldexp/tables).
966  const uint32_t biased_exp32 = biased_exp + (127 - 15);
967  const uint32_t mantissa32 = mantissa << (23 - 10);
968  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
969  float out;
970  CopyBytes<4>(&bits32, &out);
971  return Vec1<float>(out);
972 }
973 
975  return Set(d, F32FromBF16(v.raw));
976 }
977 
979  const Vec1<float> v) {
980  uint32_t bits32;
981  CopyBytes<4>(&v.raw, &bits32);
982  const uint32_t sign = bits32 >> 31;
983  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
984  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
985 
986  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
987 
988  // Tiny or zero => zero.
989  Vec1<float16_t> out;
990  if (exp < -24) {
991 #if HWY_NATIVE_FLOAT16
992  const uint16_t zero = 0;
993  CopyBytes<2>(&zero, &out.raw);
994 #else
995  out.raw.bits = 0;
996 #endif
997  return out;
998  }
999 
1000  uint32_t biased_exp16, mantissa16;
1001 
1002  // exp = [-24, -15] => subnormal
1003  if (exp < -14) {
1004  biased_exp16 = 0;
1005  const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
1006  HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
1007  mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
1008  (mantissa32 >> (13 + sub_exp)));
1009  } else {
1010  // exp = [-14, 15]
1011  biased_exp16 = static_cast<uint32_t>(exp + 15);
1012  HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1013  mantissa16 = mantissa32 >> 13;
1014  }
1015 
1016  HWY_DASSERT(mantissa16 < 1024);
1017  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1018  HWY_DASSERT(bits16 < 0x10000);
1019 #if HWY_NATIVE_FLOAT16
1020  const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1021  CopyBytes<2>(&narrowed, &out.raw);
1022 #else
1023  out.raw.bits = static_cast<uint16_t>(bits16);
1024 #endif
1025  return out;
1026 }
1027 
1029  return Set(d, BF16FromF32(v.raw));
1030 }
1031 
1032 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
1034  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1035  // float## -> int##: return closest representable value. We cannot exactly
1036  // represent LimitsMax<ToT> in FromT, so use double.
1037  const double f = static_cast<double>(from.raw);
1038  if (std::isinf(from.raw) ||
1039  std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1040  return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1041  : LimitsMax<ToT>());
1042  }
1043  return Vec1<ToT>(static_cast<ToT>(from.raw));
1044 }
1045 
1046 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1047 HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1048  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1049  // int## -> float##: no check needed
1050  return Vec1<ToT>(static_cast<ToT>(from.raw));
1051 }
1052 
1054  return DemoteTo(Sisd<uint8_t>(), v);
1055 }
1056 
1057 // ================================================== COMBINE
1058 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1059 
1060 template <typename T>
1062  return v;
1063 }
1064 
1065 template <typename T>
1067  return v;
1068 }
1069 
1070 // ================================================== SWIZZLE
1071 // OddEven is unsupported.
1072 
1073 template <typename T>
1075  return v.raw;
1076 }
1077 
1078 // ------------------------------ TableLookupLanes
1079 
1080 // Returned by SetTableIndices for use by TableLookupLanes.
1081 template <typename T>
1082 struct Indices1 {
1083  int raw;
1084 };
1085 
1086 template <typename T>
1088  HWY_DASSERT(idx[0] == 0);
1089  return Indices1<T>{idx[0]};
1090 }
1091 
1092 template <typename T>
1094  return v;
1095 }
1096 
1097 // ------------------------------ Reverse
1098 
1099 template <typename T>
1100 HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
1101  return v;
1102 }
1103 
1104 // ================================================== BLOCKWISE
1105 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1106 
1107 // ------------------------------ Broadcast/splat any lane
1108 
1109 template <int kLane, typename T>
1111  static_assert(kLane == 0, "Scalar only has one lane");
1112  return v;
1113 }
1114 
1115 // ------------------------------ Shuffle bytes with variable indices
1116 
1117 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
1118 // indices in [0, sizeof(T)).
1119 template <typename T>
1121  uint8_t in_bytes[sizeof(T)];
1122  uint8_t from_bytes[sizeof(T)];
1123  uint8_t out_bytes[sizeof(T)];
1124  CopyBytes<sizeof(T)>(&in, &in_bytes);
1125  CopyBytes<sizeof(T)>(&from, &from_bytes);
1126  for (size_t i = 0; i < sizeof(T); ++i) {
1127  out_bytes[i] = in_bytes[from_bytes[i]];
1128  }
1129  T out;
1130  CopyBytes<sizeof(T)>(&out_bytes, &out);
1131  return Vec1<T>{out};
1132 }
1133 
1134 template <typename T>
1136  uint8_t in_bytes[sizeof(T)];
1137  uint8_t from_bytes[sizeof(T)];
1138  uint8_t out_bytes[sizeof(T)];
1139  CopyBytes<sizeof(T)>(&in, &in_bytes);
1140  CopyBytes<sizeof(T)>(&from, &from_bytes);
1141  for (size_t i = 0; i < sizeof(T); ++i) {
1142  out_bytes[i] = from_bytes[i] & 0x80 ? 0 : in_bytes[from_bytes[i]];
1143  }
1144  T out;
1145  CopyBytes<sizeof(T)>(&out_bytes, &out);
1146  return Vec1<T>{out};
1147 }
1148 
1149 // ------------------------------ ZipLower
1150 
1152  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1153 }
1155  const Vec1<uint16_t> b) {
1156  return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1157 }
1159  const Vec1<uint32_t> b) {
1160  return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1161 }
1163  return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1164 }
1166  return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1167 }
1169  return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1170 }
1171 
1172 template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
1173 HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
1174  return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
1175 }
1176 
1177 // ================================================== MASK
1178 
1179 template <typename T>
1180 HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
1181  return mask.bits == 0;
1182 }
1183 
1184 template <typename T>
1185 HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1186  return mask.bits != 0;
1187 }
1188 
1189 // `bits` points to at least 8 readable bytes, not all of which need be valid.
1190 template <typename T>
1192  const uint8_t* HWY_RESTRICT bits) {
1193  return Mask1<T>::FromBool((bits[0] & 1) != 0);
1194 }
1195 
1196 // `bits` points to at least 8 writable bytes.
1197 template <typename T>
1198 HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
1199  *bits = AllTrue(d, mask);
1200  return 1;
1201 }
1202 
1203 template <typename T>
1204 HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1205  return mask.bits == 0 ? 0 : 1;
1206 }
1207 
1208 template <typename T>
1209 HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1210  return mask.bits == 0 ? -1 : 0;
1211 }
1212 
1213 // ------------------------------ Compress, CompressBits
1214 
1215 template <typename T>
1216 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
1217  // Upper lanes are undefined, so result is the same independent of mask.
1218  return v;
1219 }
1220 
1221 template <typename T>
1222 HWY_API Vec1<T> Compress(Vec1<T> v, const uint8_t* HWY_RESTRICT /* bits */) {
1223  return v;
1224 }
1225 
1226 // ------------------------------ CompressStore
1227 
1228 template <typename T>
1230  T* HWY_RESTRICT unaligned) {
1231  StoreU(Compress(v, mask), d, unaligned);
1232  return CountTrue(d, mask);
1233 }
1234 
1235 // ------------------------------ CompressBitsStore
1236 
1237 template <typename T>
1238 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1239  Sisd<T> d, T* HWY_RESTRICT unaligned) {
1240  const Mask1<T> mask = LoadMaskBits(d, bits);
1241  StoreU(Compress(v, mask), d, unaligned);
1242  return CountTrue(d, mask);
1243 }
1244 
1245 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1246 
1248  Vec1<bfloat16_t> a,
1249  Vec1<bfloat16_t> b,
1250  const Vec1<float> sum0,
1251  Vec1<float>& /* sum1 */) {
1252  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1253  Vec1<float>(F32FromBF16(b.raw)), sum0);
1254 }
1255 
1256 // ================================================== REDUCTIONS
1257 
1258 // Sum of all lanes, i.e. the only one.
1259 template <typename T>
1261  return v;
1262 }
1263 template <typename T>
1265  return v;
1266 }
1267 template <typename T>
1269  return v;
1270 }
1271 
1272 // ================================================== DEPRECATED
1273 
1274 template <typename T>
1275 HWY_API size_t StoreMaskBits(const Mask1<T> mask, uint8_t* bits) {
1276  return StoreMaskBits(Sisd<T>(), mask, bits);
1277 }
1278 
1279 template <typename T>
1280 HWY_API bool AllTrue(const Mask1<T> mask) {
1281  return AllTrue(Sisd<T>(), mask);
1282 }
1283 
1284 template <typename T>
1285 HWY_API bool AllFalse(const Mask1<T> mask) {
1286  return AllFalse(Sisd<T>(), mask);
1287 }
1288 
1289 template <typename T>
1290 HWY_API size_t CountTrue(const Mask1<T> mask) {
1291  return CountTrue(Sisd<T>(), mask);
1292 }
1293 
1294 template <typename T>
1296  return SumOfLanes(Sisd<T>(), v);
1297 }
1298 template <typename T>
1300  return MinOfLanes(Sisd<T>(), v);
1301 }
1302 template <typename T>
1304  return MaxOfLanes(Sisd<T>(), v);
1305 }
1306 
1307 // ================================================== Operator wrapper
1308 
1309 template <class V>
1310 HWY_API V Add(V a, V b) {
1311  return a + b;
1312 }
1313 template <class V>
1314 HWY_API V Sub(V a, V b) {
1315  return a - b;
1316 }
1317 
1318 template <class V>
1319 HWY_API V Mul(V a, V b) {
1320  return a * b;
1321 }
1322 template <class V>
1323 HWY_API V Div(V a, V b) {
1324  return a / b;
1325 }
1326 
// Named-function form of operator<<.
// NOTE(review): unlike the other wrappers, Shl/Shr carry no HWY_API; the
// other backends follow the same convention, so it is preserved here.
template <class V>
V Shl(V a, V b) {
  const V shifted = a << b;
  return shifted;
}
// Named-function form of operator>>.
// NOTE(review): like Shl, deliberately lacks HWY_API to match other backends.
template <class V>
V Shr(V a, V b) {
  const V shifted = a >> b;
  return shifted;
}
1335 
1336 template <class V>
1337 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1338  return a == b;
1339 }
1340 template <class V>
1341 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1342  return a != b;
1343 }
1344 template <class V>
1345 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1346  return a < b;
1347 }
1348 
1349 template <class V>
1350 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1351  return a > b;
1352 }
1353 template <class V>
1354 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1355  return a >= b;
1356 }
1357 
1358 template <class V>
1359 HWY_API auto Le(V a, V b) -> decltype(a == b) {
1360  return a <= b;
1361 }
1362 
1363 // NOLINTNEXTLINE(google-readability-namespace-comments)
1364 } // namespace HWY_NAMESPACE
1365 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:123
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_API
Definition: base.h:117
#define HWY_MIN(a, b)
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:59
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: scalar-inl.h:67
Raw bits
Definition: scalar-inl.h:77
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:68
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:71
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5035
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5027
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1073
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5040
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
V Shl(V a, V b)
Definition: arm_neon-inl.h:5018
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5044
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:904
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1795
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1438
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1443
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:778
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1448
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:392
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1735
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:794
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1194
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5004
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1799
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1084
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
V Ceiling(const V v)
Definition: scalar-inl.h:697
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5049
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1077
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
V Shr(V a, V b)
Definition: arm_neon-inl.h:5022
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5013
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:984
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5009
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:535
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:405
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4012
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
Definition: aligned_allocator.h:23
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:634
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:648
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:656
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:589
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:523
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
Definition: scalar-inl.h:1082
int raw
Definition: scalar-inl.h:1083
Definition: shared-inl.h:35
Definition: scalar-inl.h:34
T raw
Definition: scalar-inl.h:62
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:40
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:49
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:46
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:43
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:52
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:38
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:58
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:55
Definition: scalar-inl.h:83
Sisd< T > operator()(Vec1< T >) const
Definition: scalar-inl.h:85