mdds
soa/block_util.hpp
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
 *
 * Copyright (c) 2021 Kohei Yoshida
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 ************************************************************************/

#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP

#include "mdds/global.hpp"
#include "../types.hpp"

#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX2__)
#include <immintrin.h>
#endif

namespace mdds { namespace mtv { namespace soa {

namespace detail {

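// The primary template below is intentionally unusable: instantiating it
// fails the static_assert on the dependent constant invalid_static_int<F>,
// so callers must pick one of the lu_factor_t specializations that follow.
// Each specialization applies the same position shift, differing only in
// its loop-unrolling and/or SIMD strategy.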
template<typename Blks, lu_factor_t F>
struct adjust_block_positions
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::none>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < n; ++i)
            block_store.positions[i] += delta;
    }
};

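// All of the plain unrolled variants (lu4, lu8, lu16, lu32) follow the same
// shape: shrink the section length to a multiple of the unrolling factor,
// sweep that part with an unrolled (optionally OpenMP-parallel) loop, then
// handle the leftover elements with a scalar tail loop. For instance, with
// n = 10, start_block_index = 3 and factor 4: len = 7 and rem = 7 & 3 = 3,
// so the unrolled loop covers indices 3..6 and the tail loop 7..9.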
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // % 4
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
            block_store.positions[i+8] += delta;
            block_store.positions[i+9] += delta;
            block_store.positions[i+10] += delta;
            block_store.positions[i+11] += delta;
            block_store.positions[i+12] += delta;
            block_store.positions[i+13] += delta;
            block_store.positions[i+14] += delta;
            block_store.positions[i+15] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
            block_store.positions[i+8] += delta;
            block_store.positions[i+9] += delta;
            block_store.positions[i+10] += delta;
            block_store.positions[i+11] += delta;
            block_store.positions[i+12] += delta;
            block_store.positions[i+13] += delta;
            block_store.positions[i+14] += delta;
            block_store.positions[i+15] += delta;
            block_store.positions[i+16] += delta;
            block_store.positions[i+17] += delta;
            block_store.positions[i+18] += delta;
            block_store.positions[i+19] += delta;
            block_store.positions[i+20] += delta;
            block_store.positions[i+21] += delta;
            block_store.positions[i+22] += delta;
            block_store.positions[i+23] += delta;
            block_store.positions[i+24] += delta;
            block_store.positions[i+25] += delta;
            block_store.positions[i+26] += delta;
            block_store.positions[i+27] += delta;
            block_store.positions[i+28] += delta;
            block_store.positions[i+29] += delta;
            block_store.positions[i+30] += delta;
            block_store.positions[i+31] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#if defined(__SSE2__)

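// The SSE2 variants pack two 64-bit position values into each 128-bit
// register. _mm_set_epi64x(delta, delta) broadcasts the shift into both
// lanes, and the unaligned load/store intrinsics (_mm_loadu_si128 /
// _mm_storeu_si128) are used because the positions buffer is not
// guaranteed to be 16-byte aligned.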
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 2.
        int64_t len = n - start_block_index;
        bool odd = len & 1;
        if (odd)
            len -= 1;

        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 2)
        {
            __m128i* dst = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
        }

        if (odd)
            block_store.positions[len] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i+8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i+10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i+12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i+14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i+8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i+10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i+12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i+14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i+16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i+18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i+20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i+22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i+24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i+26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i+28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i+30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __SSE2__

#if defined(__AVX2__)

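// The AVX2 variants widen the same idea to four 64-bit positions per
// 256-bit register, with _mm256_set1_epi64x broadcasting delta into all
// four lanes and unaligned 256-bit loads/stores doing the work.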
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // % 4
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i+4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i+8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i+12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i+4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i+8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i+12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i+16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i+20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i+24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i+28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __AVX2__

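// A minimal usage sketch (hypothetical; the real call sites live inside
// multi_type_vector's SoA implementation): given a block-store type Blks
// whose `positions` member is a std::vector of 64-bit offsets, shifting
// every block position from index 2 onward by +10 with the 8-way unrolled
// variant would look like:
//
//     adjust_block_positions<Blks, lu_factor_t::lu8>{}(block_store, 2, 10);
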
} // namespace detail

}}}

#endif

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */