17 #if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19 #ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
20 #undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
22 #define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
32 #if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
34 #define HWY_SORT_VERIFY 1
// Sort direction used by every routine in this header.
enum class SortOrder { kAscending, kDescending };

// Returns the opposite sort order. constexpr so it can be used as a template
// argument (e.g. to sort alternating runs in opposite directions).
constexpr inline SortOrder Reverse(SortOrder order) {
  return (order == SortOrder::kAscending) ? SortOrder::kDescending
                                          : SortOrder::kAscending;
}
46 bool Compare(T a, T b, SortOrder kOrder) {
47 if (kOrder == SortOrder::kAscending)
return a <= b;
58 Runs(D d,
size_t num_regs,
size_t run_length = 0,
bool alternating =
false) {
59 const size_t N =
Lanes(d);
61 buf_ = AllocateAligned<T>(N);
62 consecutive_ = AllocateAligned<T>(num_regs * N);
66 run_length_ = run_length;
67 num_runs_ = num_regs * N / run_length;
69 alternating_ = alternating;
71 run_length_ = num_regs * 4;
78 void ScatterQuartets(D d,
const size_t idx_reg, Vec<D> v) {
80 const size_t N =
Lanes(d);
81 for (
size_t i = 0; i < N; i += 4) {
82 Store(v, d, buf_.get());
83 const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
84 CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
88 void StoreVector(D d,
const size_t idx_reg, Vec<D> v) {
90 Store(v, d, &consecutive_[idx_reg *
Lanes(d)]);
93 bool IsBitonic()
const {
95 for (
size_t ir = 0; ir < num_runs_; ++ir) {
96 const T* p = &consecutive_[ir * run_length_];
101 for (
size_t i = 0; i < run_length_ / 2 - 1; ++i) {
102 is_asc &= (p[i] <= p[i + 1]);
103 is_desc &= (p[i] >= p[i + 1]);
105 for (
size_t i = 0; i < run_length_; ++i) {
106 is_zero &= (p[i] == 0);
110 bool is_desc2 =
true;
111 for (
size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
112 is_asc2 &= (p[i] <= p[i + 1]);
113 is_desc2 &= (p[i] >= p[i + 1]);
116 if (is_zero)
continue;
117 if (is_asc && is_desc2)
continue;
118 if (is_desc && is_asc2)
continue;
124 void CheckBitonic(
int line,
int caller)
const {
125 if (IsBitonic())
return;
126 for (
size_t ir = 0; ir < num_runs_; ++ir) {
127 const T* p = &consecutive_[ir * run_length_];
128 printf(
"run %zu (len %zu)\n", ir, run_length_);
129 for (
size_t i = 0; i < run_length_; ++i) {
130 printf(
"%.0f\n",
static_cast<float>(p[i]));
133 printf(
"caller %d\n", caller);
134 hwy::Abort(
"",
line,
"not bitonic");
137 void CheckSorted(SortOrder kOrder,
int line,
int caller)
const {
138 for (
size_t ir = 0; ir < num_runs_; ++ir) {
139 const SortOrder order =
140 (alternating_ && (ir & 1)) ?
Reverse(kOrder) : kOrder;
141 const T* p = &consecutive_[ir * run_length_];
143 for (
size_t i = 0; i < run_length_ - 1; ++i) {
144 if (!Compare(p[i], p[i + 1], order)) {
146 "ir%zu run_length=%zu alt=%d original order=%d this order=%d\n",
147 ir, run_length_, alternating_,
static_cast<int>(kOrder),
148 static_cast<int>(order));
149 for (
size_t i = 0; i < run_length_; ++i) {
150 printf(
" %.0f\n",
static_cast<float>(p[i]));
152 printf(
"caller %d\n", caller);
153 hwy::Abort(
"",
line,
"not sorted");
160 AlignedFreeUniquePtr<T[]> buf_;
161 AlignedFreeUniquePtr<T[]> consecutive_;
170 Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
172 runs.ScatterQuartets(d, 0, v0);
177 Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
179 runs.ScatterQuartets(d, 0, v0);
180 runs.ScatterQuartets(d, 1, v1);
185 Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
188 runs.ScatterQuartets(d, 0, v0);
189 runs.ScatterQuartets(d, 1, v1);
190 runs.ScatterQuartets(d, 2, v2);
191 runs.ScatterQuartets(d, 3, v3);
196 Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
197 Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
200 runs.ScatterQuartets(d, 0, v0);
201 runs.ScatterQuartets(d, 1, v1);
202 runs.ScatterQuartets(d, 2, v2);
203 runs.ScatterQuartets(d, 3, v3);
204 runs.ScatterQuartets(d, 4, v4);
205 runs.ScatterQuartets(d, 5, v5);
206 runs.ScatterQuartets(d, 6, v6);
207 runs.ScatterQuartets(d, 7, v7);
212 Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
213 Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
214 Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
215 Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
218 runs.ScatterQuartets(d, 0x0, v0);
219 runs.ScatterQuartets(d, 0x1, v1);
220 runs.ScatterQuartets(d, 0x2, v2);
221 runs.ScatterQuartets(d, 0x3, v3);
222 runs.ScatterQuartets(d, 0x4, v4);
223 runs.ScatterQuartets(d, 0x5, v5);
224 runs.ScatterQuartets(d, 0x6, v6);
225 runs.ScatterQuartets(d, 0x7, v7);
226 runs.ScatterQuartets(d, 0x8, v8);
227 runs.ScatterQuartets(d, 0x9, v9);
228 runs.ScatterQuartets(d, 0xA, vA);
229 runs.ScatterQuartets(d, 0xB, vB);
230 runs.ScatterQuartets(d, 0xC, vC);
231 runs.ScatterQuartets(d, 0xD, vD);
232 runs.ScatterQuartets(d, 0xE, vE);
233 runs.ScatterQuartets(d, 0xF, vF);
238 Runs<D> StoreDeinterleavedQuartets(
239 D d,
const Vec<D>& v00,
const Vec<D>& v01,
const Vec<D>& v02,
240 const Vec<D>& v03,
const Vec<D>& v04,
const Vec<D>& v05,
const Vec<D>& v06,
241 const Vec<D>& v07,
const Vec<D>& v08,
const Vec<D>& v09,
const Vec<D>& v0A,
242 const Vec<D>& v0B,
const Vec<D>& v0C,
const Vec<D>& v0D,
const Vec<D>& v0E,
243 const Vec<D>& v0F,
const Vec<D>& v10,
const Vec<D>& v11,
const Vec<D>& v12,
244 const Vec<D>& v13,
const Vec<D>& v14,
const Vec<D>& v15,
const Vec<D>& v16,
245 const Vec<D>& v17,
const Vec<D>& v18,
const Vec<D>& v19,
const Vec<D>& v1A,
246 const Vec<D>& v1B,
const Vec<D>& v1C,
const Vec<D>& v1D,
const Vec<D>& v1E,
249 runs.ScatterQuartets(d, 0x00, v00);
250 runs.ScatterQuartets(d, 0x01, v01);
251 runs.ScatterQuartets(d, 0x02, v02);
252 runs.ScatterQuartets(d, 0x03, v03);
253 runs.ScatterQuartets(d, 0x04, v04);
254 runs.ScatterQuartets(d, 0x05, v05);
255 runs.ScatterQuartets(d, 0x06, v06);
256 runs.ScatterQuartets(d, 0x07, v07);
257 runs.ScatterQuartets(d, 0x08, v08);
258 runs.ScatterQuartets(d, 0x09, v09);
259 runs.ScatterQuartets(d, 0x0A, v0A);
260 runs.ScatterQuartets(d, 0x0B, v0B);
261 runs.ScatterQuartets(d, 0x0C, v0C);
262 runs.ScatterQuartets(d, 0x0D, v0D);
263 runs.ScatterQuartets(d, 0x0E, v0E);
264 runs.ScatterQuartets(d, 0x0F, v0F);
265 runs.ScatterQuartets(d, 0x10, v10);
266 runs.ScatterQuartets(d, 0x11, v11);
267 runs.ScatterQuartets(d, 0x12, v12);
268 runs.ScatterQuartets(d, 0x13, v13);
269 runs.ScatterQuartets(d, 0x14, v14);
270 runs.ScatterQuartets(d, 0x15, v15);
271 runs.ScatterQuartets(d, 0x16, v16);
272 runs.ScatterQuartets(d, 0x17, v17);
273 runs.ScatterQuartets(d, 0x18, v18);
274 runs.ScatterQuartets(d, 0x19, v19);
275 runs.ScatterQuartets(d, 0x1A, v1A);
276 runs.ScatterQuartets(d, 0x1B, v1B);
277 runs.ScatterQuartets(d, 0x1C, v1C);
278 runs.ScatterQuartets(d, 0x1D, v1D);
279 runs.ScatterQuartets(d, 0x1E, v1E);
280 runs.ScatterQuartets(d, 0x1F, v1F);
285 Runs<D> StoreVectors(D d, Vec<D> v0,
size_t run_length,
bool alternating) {
286 Runs runs(d, 1, run_length, alternating);
287 runs.StoreVector(d, 0, v0);
292 Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
293 constexpr
size_t kRegs = 2;
294 Runs runs(d, kRegs, kRegs *
Lanes(d),
false);
295 runs.StoreVector(d, 0, v0);
296 runs.StoreVector(d, 1, v1);
301 Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
302 constexpr
size_t kRegs = 4;
303 Runs runs(d, kRegs, kRegs *
Lanes(d),
false);
304 runs.StoreVector(d, 0, v0);
305 runs.StoreVector(d, 1, v1);
306 runs.StoreVector(d, 2, v2);
307 runs.StoreVector(d, 3, v3);
312 Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
313 Vec<D> v5, Vec<D> v6, Vec<D> v7) {
314 constexpr
size_t kRegs = 8;
315 Runs runs(d, kRegs, kRegs *
Lanes(d),
false);
316 runs.StoreVector(d, 0, v0);
317 runs.StoreVector(d, 1, v1);
318 runs.StoreVector(d, 2, v2);
319 runs.StoreVector(d, 3, v3);
320 runs.StoreVector(d, 4, v4);
321 runs.StoreVector(d, 5, v5);
322 runs.StoreVector(d, 6, v6);
323 runs.StoreVector(d, 7, v7);
338 template <SortOrder kOrder,
class V>
339 HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
341 a = (kOrder == SortOrder::kAscending) ?
Min(a, b) :
Max(a, b);
342 b = (kOrder == SortOrder::kAscending) ?
Max(temp, b) :
Min(temp, b);
346 template <SortOrder kOrder,
class D,
class V = Vec<D>>
347 HWY_INLINE void SortLanesIn4Vectors(D d,
const TFromD<D>* in, V& v0, V& v1,
349 const size_t N =
Lanes(d);
355 v0 =
Load(d, in + 0 * N);
356 v2 =
Load(d, in + 2 * N);
357 SortLanesIn2Vectors<kOrder>(v0, v2);
358 v1 =
Load(d, in + 1 * N);
359 v3 =
Load(d, in + 3 * N);
360 SortLanesIn2Vectors<kOrder>(v1, v3);
363 SortLanesIn2Vectors<kOrder>(v0, v1);
364 SortLanesIn2Vectors<kOrder>(v2, v3);
367 SortLanesIn2Vectors<kOrder>(v1, v2);
373 template <
class D,
class V = Vec<D>>
374 HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
400 template <SortOrder kOrder,
class D,
class V = Vec<D>>
401 HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1,
int caller) {
403 const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
404 const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
405 input0.CheckSorted(kOrder, __LINE__, caller);
406 input1.CheckSorted(kOrder, __LINE__, caller);
413 SortLanesIn2Vectors<kOrder>(v0, v1);
415 SortLanesIn2Vectors<kOrder>(v0, v1);
417 SortLanesIn2Vectors<kOrder>(v0, v1);
419 SortLanesIn2Vectors<kOrder>(v0, v1);
423 auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
424 output.CheckSorted(kOrder, __LINE__, caller);
432 template <SortOrder kOrder,
class D>
433 HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
436 #if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
437 if (
sizeof(TFromD<D>) == 4 && !
IsFloat<TFromD<D>>()) {
439 const auto wide =
BitCast(dw, q_or_v);
441 if (kOrder == SortOrder::kAscending) {
450 SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
451 q_or_v =
OddEven(swapped, q_or_v);
456 template <SortOrder kOrder,
class D>
457 HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
460 SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
469 template <SortOrder kOrder,
class D,
class V = Vec<D>>
470 HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1,
int caller) {
472 const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
473 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
477 SortLanesIn2Vectors<kOrder>(q0, q1);
480 SortDistance2LanesQV<kOrder>(d, q0);
481 SortDistance2LanesQV<kOrder>(d, q1);
484 SortAdjacentLanesQV<kOrder>(d, q0);
485 SortAdjacentLanesQV<kOrder>(d, q1);
488 const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
489 output.CheckSorted(kOrder, __LINE__, caller);
494 template <SortOrder kOrder,
class D,
class V = Vec<D>>
495 HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
498 const verify::Runs<D> input =
499 verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
500 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
504 SortLanesIn2Vectors<kOrder>(q0, q2);
505 SortLanesIn2Vectors<kOrder>(q1, q3);
509 BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
510 BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);
513 const verify::Runs<D> output =
514 verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
515 output.CheckSorted(kOrder, __LINE__, caller);
520 template <SortOrder kOrder,
class D,
class V = Vec<D>>
521 HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
522 V& q5, V& q6, V& q7,
int caller) {
524 const verify::Runs<D> input =
525 verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
526 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
530 SortLanesIn2Vectors<kOrder>(q0, q4);
531 SortLanesIn2Vectors<kOrder>(q1, q5);
532 SortLanesIn2Vectors<kOrder>(q2, q6);
533 SortLanesIn2Vectors<kOrder>(q3, q7);
536 BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
537 BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);
540 const verify::Runs<D> output =
541 verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
542 output.CheckSorted(kOrder, __LINE__, caller);
549 #if HWY_TARGET <= HWY_AVX3
552 template <
typename T>
553 Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
554 return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
557 template <
typename T>
558 Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
559 return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
562 template <
typename T>
563 Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
564 return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
567 template <
typename T>
568 Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
569 return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
572 template <SortOrder kOrder,
class T>
573 HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d,
Vec<decltype(d)>& v) {
576 Vec512<T> swapped = Shuffle128_2301(v, v);
577 SortLanesIn2Vectors<kOrder>(v, swapped);
578 v = OddEven128(swapped, v);
583 template <SortOrder kOrder,
typename T>
584 HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d,
Vec<decltype(d)>& v) {
586 SortLanesIn2Vectors<kOrder>(v, swapped);
590 template <SortOrder kOrder,
typename T>
591 HWY_INLINE void SortDistance4LanesV(Simd<T, 4> , ...) {}
594 template <SortOrder kOrder,
class D>
595 HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
597 SortLanesIn2Vectors<kOrder>(v, swapped);
602 template <SortOrder kOrder,
class D,
class V = Vec<D>>
603 HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
604 V& v6, V& v7,
int caller) {
606 const verify::Runs<D> input =
607 verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
608 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
612 SortLanesIn2Vectors<kOrder>(v0, v4);
613 SortLanesIn2Vectors<kOrder>(v1, v5);
614 SortLanesIn2Vectors<kOrder>(v2, v6);
615 SortLanesIn2Vectors<kOrder>(v3, v7);
618 SortLanesIn2Vectors<kOrder>(v0, v2);
619 SortLanesIn2Vectors<kOrder>(v1, v3);
620 SortLanesIn2Vectors<kOrder>(v4, v6);
621 SortLanesIn2Vectors<kOrder>(v5, v7);
624 SortLanesIn2Vectors<kOrder>(v0, v1);
625 SortLanesIn2Vectors<kOrder>(v2, v3);
626 SortLanesIn2Vectors<kOrder>(v4, v5);
627 SortLanesIn2Vectors<kOrder>(v6, v7);
630 SortDistance4LanesV<kOrder>(d, v0);
631 SortDistance4LanesV<kOrder>(d, v1);
632 SortDistance4LanesV<kOrder>(d, v2);
633 SortDistance4LanesV<kOrder>(d, v3);
634 SortDistance4LanesV<kOrder>(d, v4);
635 SortDistance4LanesV<kOrder>(d, v5);
636 SortDistance4LanesV<kOrder>(d, v6);
637 SortDistance4LanesV<kOrder>(d, v7);
640 SortDistance2LanesQV<kOrder>(d, v0);
641 SortDistance2LanesQV<kOrder>(d, v1);
642 SortDistance2LanesQV<kOrder>(d, v2);
643 SortDistance2LanesQV<kOrder>(d, v3);
644 SortDistance2LanesQV<kOrder>(d, v4);
645 SortDistance2LanesQV<kOrder>(d, v5);
646 SortDistance2LanesQV<kOrder>(d, v6);
647 SortDistance2LanesQV<kOrder>(d, v7);
650 SortAdjacentLanesQV<kOrder>(d, v0);
651 SortAdjacentLanesQV<kOrder>(d, v1);
652 SortAdjacentLanesQV<kOrder>(d, v2);
653 SortAdjacentLanesQV<kOrder>(d, v3);
654 SortAdjacentLanesQV<kOrder>(d, v4);
655 SortAdjacentLanesQV<kOrder>(d, v5);
656 SortAdjacentLanesQV<kOrder>(d, v6);
657 SortAdjacentLanesQV<kOrder>(d, v7);
660 const verify::Runs<D> output =
661 verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
662 output.CheckSorted(kOrder, __LINE__, caller);
667 template <SortOrder kOrder,
class D,
class V = Vec<D>>
668 HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3,
int caller) {
670 const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
671 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
675 SortLanesIn2Vectors<kOrder>(v0, v2);
676 SortLanesIn2Vectors<kOrder>(v1, v3);
679 SortLanesIn2Vectors<kOrder>(v0, v1);
680 SortLanesIn2Vectors<kOrder>(v2, v3);
683 SortDistance8LanesV<kOrder>(d, v0);
684 SortDistance8LanesV<kOrder>(d, v1);
685 SortDistance8LanesV<kOrder>(d, v2);
686 SortDistance8LanesV<kOrder>(d, v3);
689 SortDistance4LanesV<kOrder>(d, v0);
690 SortDistance4LanesV<kOrder>(d, v1);
691 SortDistance4LanesV<kOrder>(d, v2);
692 SortDistance4LanesV<kOrder>(d, v3);
695 SortDistance2LanesQV<kOrder>(d, v0);
696 SortDistance2LanesQV<kOrder>(d, v1);
697 SortDistance2LanesQV<kOrder>(d, v2);
698 SortDistance2LanesQV<kOrder>(d, v3);
701 SortAdjacentLanesQV<kOrder>(d, v0);
702 SortAdjacentLanesQV<kOrder>(d, v1);
703 SortAdjacentLanesQV<kOrder>(d, v2);
704 SortAdjacentLanesQV<kOrder>(d, v3);
707 const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
708 output.CheckSorted(kOrder, __LINE__, caller);
713 template <SortOrder kOrder,
class D,
class V = Vec<D>>
714 HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
715 V& v6, V& v7,
int caller) {
717 const verify::Runs<D> input =
718 verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
719 if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
723 SortLanesIn2Vectors<kOrder>(v0, v4);
724 SortLanesIn2Vectors<kOrder>(v1, v5);
725 SortLanesIn2Vectors<kOrder>(v2, v6);
726 SortLanesIn2Vectors<kOrder>(v3, v7);
728 BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
729 BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);
732 const verify::Runs<D> output =
733 verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
734 output.CheckSorted(kOrder, __LINE__, caller);
741 template <SortOrder kOrder,
class D,
class V>
742 HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
743 V& q5, V& q6, V& q7, TFromD<D>* inout) {
744 Store(q0, d, inout + 0 * 4);
745 Store(q1, d, inout + 1 * 4);
746 Store(q2, d, inout + 2 * 4);
747 Store(q3, d, inout + 3 * 4);
748 Store(q4, d, inout + 4 * 4);
749 Store(q5, d, inout + 5 * 4);
750 Store(q6, d, inout + 6 * 4);
751 Store(q7, d, inout + 7 * 4);
756 template <SortOrder kOrder,
class D,
class V>
757 HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
758 V& q5, V& q6, V& q7, TFromD<D>* inout) {
768 detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
770 Store(v0, d, inout + 0 * 8);
771 Store(v1, d, inout + 1 * 8);
772 Store(v2, d, inout + 2 * 8);
773 Store(v3, d, inout + 3 * 8);
774 Store(v4, d, inout + 4 * 8);
775 Store(v5, d, inout + 5 * 8);
776 Store(v6, d, inout + 6 * 8);
777 Store(v7, d, inout + 7 * 8);
782 template <SortOrder kOrder,
typename T,
class V>
783 HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
784 V& q4, V& q5, V& q6, V& q7, T* inout) {
785 const V q11_01_10_00 = Shuffle128_2020(q0, q1);
786 const V q13_03_12_02 = Shuffle128_2020(q2, q3);
787 V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02);
789 const V q15_05_14_04 = Shuffle128_2020(q4, q5);
790 const V q17_07_16_06 = Shuffle128_2020(q6, q7);
791 V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06);
793 const V q19_09_18_08 = Shuffle128_3131(q0, q1);
794 const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
795 V v3 =
Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a));
797 const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
798 const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
799 V v2 =
Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e));
801 detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);
804 V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02);
805 V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06);
806 V v7 =
Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a));
807 V v6 =
Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e));
809 detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);
811 detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
813 Store(v0, d, inout + 0 * 16);
814 Store(v1, d, inout + 1 * 16);
815 Store(v2, d, inout + 2 * 16);
816 Store(v3, d, inout + 3 * 16);
817 Store(v4, d, inout + 4 * 16);
818 Store(v5, d, inout + 5 * 16);
819 Store(v6, d, inout + 6 * 16);
820 Store(v7, d, inout + 7 * 16);
825 template <SortOrder kOrder,
typename T>
826 HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> , ...) {
830 template <SortOrder kOrder,
typename T>
831 HWY_API size_t FourQuartetsPerVector(Simd<T, 4> , ...) {
834 template <SortOrder kOrder,
typename T>
835 HWY_API size_t FourQuartetsPerVector(Simd<T, 8> , ...) {
842 HWY_API size_t SortBatchSize(D d) {
843 const size_t N =
Lanes(d);
844 if (N == 4)
return 32;
845 if (N == 8)
return 64;
846 if (N == 16)
return 128;
850 template <SortOrder kOrder,
class D>
851 HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
852 const size_t N =
Lanes(d);
854 Vec<D> q0, q1, q2, q3;
855 detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
856 detail::Transpose4x4(d, q0, q1, q2, q3);
857 detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
858 detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);
861 constexpr SortOrder kReverse =
Reverse(kOrder);
863 Vec<D> q4, q5, q6, q7;
864 detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
865 detail::Transpose4x4(d, q4, q5, q6, q7);
866 detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
867 detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);
869 detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
870 detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);
872 detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
876 return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
881 return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
885 return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
#define HWY_API
Definition: base.h:117
#define HWY_INLINE
Definition: base.h:59
#define HWY_ASSERT(condition)
Definition: base.h:142
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2332
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3435
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
constexpr HWY_API size_t Lanes(Simd< T, N >)
Definition: arm_sve-inl.h:226
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3726
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3395
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3637
HWY_API Vec128< uint32_t, 2 > Shuffle2301(const Vec128< uint32_t, 2 > v)
Definition: arm_neon-inl.h:1698
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3752
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3385
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:31
Definition: aligned_allocator.h:23
HWY_NORETURN void int line
Definition: base.h:665
constexpr bool IsFloat()
Definition: base.h:308
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77