18 #if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
19 defined(HWY_TARGET_TOGGLE)
20 #ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
21 #undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
23 #define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
54 template <
int kAssumptions,
class D,
typename T = TFromD<D>,
55 HWY_IF_NOT_LANE_SIZE_D(D, 2)>
58 const size_t num_elements) {
59 static_assert(IsFloat<T>(),
"MulAdd requires float type");
60 using V = decltype(
Zero(d));
62 const size_t N =
Lanes(d);
72 if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
77 for (; i + 2 <= num_elements; i += 2) {
78 sum0 += pa[i + 0] * pb[i + 0];
79 sum1 += pa[i + 1] * pb[i + 1];
81 if (i < num_elements) {
82 sum1 += pa[i] * pb[i];
98 for (; i + 4 * N <= num_elements; ) {
99 const auto a0 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
100 const auto b0 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
102 sum0 =
MulAdd(a0, b0, sum0);
103 const auto a1 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
104 const auto b1 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
106 sum1 =
MulAdd(a1, b1, sum1);
107 const auto a2 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
108 const auto b2 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
110 sum2 =
MulAdd(a2, b2, sum2);
111 const auto a3 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
112 const auto b3 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
114 sum3 =
MulAdd(a3, b3, sum3);
118 for (; i + N <= num_elements; i += N) {
119 const auto a = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
120 const auto b = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
121 sum0 =
MulAdd(a, b, sum0);
124 if (!kIsMultipleOfVector) {
125 const size_t remaining = num_elements - i;
126 if (remaining != 0) {
127 if (kIsPaddedToVector) {
128 const auto mask =
FirstN(d, remaining);
129 const auto a = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
130 const auto b = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
138 const auto skip =
FirstN(d, N - remaining);
139 const auto a =
LoadU(d, pa + i);
140 const auto b =
LoadU(d, pb + i);
147 sum0 =
Add(sum0, sum1);
148 sum2 =
Add(sum2, sum3);
149 sum0 =
Add(sum0, sum2);
154 template <
int kAssumptions,
class D>
158 const size_t num_elements) {
162 using V = decltype(
Zero(df32));
163 const size_t N =
Lanes(d);
173 if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
177 for (; i + 2 <= num_elements; i += 2) {
181 if (i < num_elements) {
195 for (; i + 2 * N <= num_elements; ) {
196 const auto a0 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
197 const auto b0 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
200 const auto a1 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
201 const auto b1 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
207 if (i + N <= num_elements) {
208 const auto a0 = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
209 const auto b0 = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
214 if (!kIsMultipleOfVector) {
215 const size_t remaining = num_elements - i;
216 if (remaining != 0) {
217 if (kIsPaddedToVector) {
218 const auto mask =
FirstN(du16, remaining);
219 const auto va = kIsAlignedA ?
Load(d, pa + i) :
LoadU(d, pa + i);
220 const auto vb = kIsAlignedB ?
Load(d, pb + i) :
LoadU(d, pb + i);
231 const auto skip =
FirstN(du16, N - remaining);
232 const auto va =
LoadU(d, pa + i);
233 const auto vb =
LoadU(d, pb + i);
242 sum0 =
Add(sum0, sum1);
243 sum2 =
Add(sum2, sum3);
244 sum0 =
Add(sum0, sum2);
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_INLINE
Definition: base.h:59
#define HWY_DASSERT(condition)
Definition: base.h:163
#define HWY_UNLIKELY(expr)
Definition: base.h:64
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
constexpr HWY_API size_t Lanes(Simd< T, N >)
Definition: arm_sve-inl.h:226
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
Definition: aligned_allocator.h:23
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:648
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
static HWY_INLINE T Compute(const D d, const T *const HWY_RESTRICT pa, const T *const HWY_RESTRICT pb, const size_t num_elements)
Definition: dot-inl.h:56
static HWY_INLINE float Compute(const D d, const bfloat16_t *const HWY_RESTRICT pa, const bfloat16_t *const HWY_RESTRICT pb, const size_t num_elements)
Definition: dot-inl.h:155
Assumptions
Definition: dot-inl.h:36
@ kMultipleOfVector
Definition: dot-inl.h:41
@ kPaddedToVector
Definition: dot-inl.h:44
@ kVectorAlignedA
Definition: dot-inl.h:49
@ kAtLeastOneVector
Definition: dot-inl.h:38
@ kVectorAlignedB
Definition: dot-inl.h:50