Crypto++  8.7
Free C++ class library of cryptographic schemes
lsh512_avx.cpp
1 // lsh512_avx.cpp - written and placed in the public domain by Jeffrey Walton
2 // Based on the specification and source code provided by
3 // Korea Internet & Security Agency (KISA) website. Also
4 // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5 // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6 
7 // We are hitting some sort of GCC bug in the LSH AVX2 code path.
8 // Clang is OK on the AVX2 code path. We believe it is GCC Issue
9 // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10 // makes using zeroupper a little tricky.
11 
12 #include "pch.h"
13 #include "config.h"
14 
15 #include "lsh.h"
16 #include "misc.h"
17 
18 // Squash MS LNK4221 and libtool warnings
19 extern const char LSH512_AVX_FNAME[] = __FILE__;
20 
21 #if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
22 
23 #if defined(CRYPTOPP_AVX2_AVAILABLE)
24 # include <emmintrin.h>
25 # include <immintrin.h>
26 #endif
27 
28 // <x86intrin.h> is included for GCC 4.5 and later. The minimum Clang version is unknown. Also see https://stackoverflow.com/a/42493893.
29 #if (CRYPTOPP_GCC_VERSION >= 40500)
30 # include <x86intrin.h>
31 #endif
32 
33 ANONYMOUS_NAMESPACE_BEGIN
34 
35 /* LSH Constants */
36 
37 const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
38 // const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
39 // const unsigned int LSH512_CV_BYTE_LEN = 128;
40 const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
41 
42 // const unsigned int MSG_BLK_WORD_LEN = 32;
43 const unsigned int CV_WORD_LEN = 16;
44 const unsigned int CONST_WORD_LEN = 8;
45 // const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
46 const unsigned int NUM_STEPS = 28;
47 
48 const unsigned int ROT_EVEN_ALPHA = 23;
49 const unsigned int ROT_EVEN_BETA = 59;
50 const unsigned int ROT_ODD_ALPHA = 7;
51 const unsigned int ROT_ODD_BETA = 3;
52 
53 const unsigned int LSH_TYPE_512_512 = 0x0010040;
54 const unsigned int LSH_TYPE_512_384 = 0x0010030;
55 const unsigned int LSH_TYPE_512_256 = 0x0010020;
56 const unsigned int LSH_TYPE_512_224 = 0x001001C;
57 
58 // const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
59 // const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
60 
61 /* Error Code */
62 
63 const unsigned int LSH_SUCCESS = 0x0;
64 // const unsigned int LSH_ERR_NULL_PTR = 0x2401;
65 // const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
66 const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
67 const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
68 
69 /* Index into our state array */
70 
71 const unsigned int AlgorithmType = 80;
72 const unsigned int RemainingBits = 81;
73 
74 NAMESPACE_END
75 
76 NAMESPACE_BEGIN(CryptoPP)
77 NAMESPACE_BEGIN(LSH)
78 
79 // lsh512.cpp
80 extern const word64 LSH512_IV224[CV_WORD_LEN];
81 extern const word64 LSH512_IV256[CV_WORD_LEN];
82 extern const word64 LSH512_IV384[CV_WORD_LEN];
83 extern const word64 LSH512_IV512[CV_WORD_LEN];
84 extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
85 
86 NAMESPACE_END // LSH
87 NAMESPACE_END // Crypto++
88 
89 ANONYMOUS_NAMESPACE_BEGIN
90 
91 using CryptoPP::byte;
92 using CryptoPP::word32;
93 using CryptoPP::word64;
96 
97 using CryptoPP::GetBlock;
101 
102 using CryptoPP::LSH::LSH512_IV224;
103 using CryptoPP::LSH::LSH512_IV256;
104 using CryptoPP::LSH::LSH512_IV384;
105 using CryptoPP::LSH::LSH512_IV512;
106 using CryptoPP::LSH::LSH512_StepConstants;
107 
108 typedef byte lsh_u8;
109 typedef word32 lsh_u32;
110 typedef word64 lsh_u64;
111 typedef word32 lsh_uint;
112 typedef word32 lsh_err;
113 typedef word32 lsh_type;
114 
115 struct LSH512_AVX2_Context
116 {
117  LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
118  cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
119  last_block(reinterpret_cast<byte*>(state+48)),
120  remain_databitlen(remainingBitLength),
121  alg_type(static_cast<lsh_type>(algType)) {}
122 
123  lsh_u64* cv_l; // start of our state block
124  lsh_u64* cv_r;
125  lsh_u64* sub_msgs;
126  lsh_u8* last_block;
127  lsh_u64& remain_databitlen;
128  lsh_type alg_type;
129 };
130 
131 struct LSH512_AVX2_Internal
132 {
133  LSH512_AVX2_Internal(word64* state) :
134  submsg_e_l(state+16), submsg_e_r(state+24),
135  submsg_o_l(state+32), submsg_o_r(state+40) { }
136 
137  lsh_u64* submsg_e_l; /* even left sub-message */
138  lsh_u64* submsg_e_r; /* even right sub-message */
139  lsh_u64* submsg_o_l; /* odd left sub-message */
140  lsh_u64* submsg_o_r; /* odd right sub-message */
141 };
142 
// Scope guard: its destructor executes _mm256_zeroupper(), clearing the
// upper 128 bits of every YMM register when the enclosing scope exits.
// That avoids AVX state transition penalties when state is saved.
// GCC Issue 82735 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735)
// makes using zeroupper a little tricky, hence the dedicated object.
struct AVX_Cleanup
{
    ~AVX_Cleanup()
    {
        _mm256_zeroupper();
    }
};
154 
155 // const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
156 
157 /* LSH AlgType Macro */
158 
159 inline bool LSH_IS_LSH512(lsh_uint val) {
160  return (val & 0xf0000) == 0x10000;
161 }
162 
163 inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
164  return val >> 24;
165 }
166 
167 inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
168  return val & 0xffff;
169 }
170 
171 inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
172  return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
173 }
174 
175 inline lsh_u64 loadLE64(lsh_u64 v) {
177 }
178 
179 lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
180  return rotlFixed(x, r);
181 }
182 
183 // Original code relied upon unaligned lsh_u64 buffer
184 inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
185 {
186  lsh_u64* submsg_e_l = i_state->submsg_e_l;
187  lsh_u64* submsg_e_r = i_state->submsg_e_r;
188  lsh_u64* submsg_o_l = i_state->submsg_o_l;
189  lsh_u64* submsg_o_r = i_state->submsg_o_r;
190 
191  _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
192  _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
193  _mm256_storeu_si256(M256_CAST(submsg_e_l+4),
194  _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
195 
196  _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
197  _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
198  _mm256_storeu_si256(M256_CAST(submsg_e_r+4),
199  _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
200 
201  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
202  _mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
203  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
204  _mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));
205 
206  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
207  _mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
208  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
209  _mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
210 }
211 
212 inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
213 {
214  CRYPTOPP_ASSERT(i_state != NULLPTR);
215 
216  lsh_u64* submsg_e_l = i_state->submsg_e_l;
217  lsh_u64* submsg_e_r = i_state->submsg_e_r;
218  lsh_u64* submsg_o_l = i_state->submsg_o_l;
219  lsh_u64* submsg_o_r = i_state->submsg_o_r;
220 
221  _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
222  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
223  _mm256_permute4x64_epi64(
224  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
225  _MM_SHUFFLE(1,0,2,3))));
226  _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
227  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
228  _mm256_permute4x64_epi64(
229  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
230  _MM_SHUFFLE(2,1,0,3))));
231 
232  _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
233  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
234  _mm256_permute4x64_epi64(
235  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
236  _MM_SHUFFLE(1,0,2,3))));
237  _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
238  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
239  _mm256_permute4x64_epi64(
240  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
241  _MM_SHUFFLE(2,1,0,3))));
242 }
243 
244 inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
245 {
246  CRYPTOPP_ASSERT(i_state != NULLPTR);
247 
248  lsh_u64* submsg_e_l = i_state->submsg_e_l;
249  lsh_u64* submsg_e_r = i_state->submsg_e_r;
250  lsh_u64* submsg_o_l = i_state->submsg_o_l;
251  lsh_u64* submsg_o_r = i_state->submsg_o_r;
252 
253  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
254  _mm256_add_epi64(
255  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
256  _mm256_permute4x64_epi64(
257  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
258  _MM_SHUFFLE(1,0,2,3))));
259  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
260  _mm256_add_epi64(
261  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
262  _mm256_permute4x64_epi64(
263  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
264  _MM_SHUFFLE(2,1,0,3))));
265 
266  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
267  _mm256_add_epi64(
268  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
269  _mm256_permute4x64_epi64(
270  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
271  _MM_SHUFFLE(1,0,2,3))));
272  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
273  _mm256_add_epi64(
274  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
275  _mm256_permute4x64_epi64(
276  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
277  _MM_SHUFFLE(2,1,0,3))));
278 }
279 
280 inline void load_sc(const lsh_u64** p_const_v, size_t i)
281 {
282  *p_const_v = &LSH512_StepConstants[i];
283 }
284 
285 inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
286 {
287  CRYPTOPP_ASSERT(i_state != NULLPTR);
288 
289  lsh_u64* submsg_e_l = i_state->submsg_e_l;
290  lsh_u64* submsg_e_r = i_state->submsg_e_r;
291 
292  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
293  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
294  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
295  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
296  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
297  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));
298 
299  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
300  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
301  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
302  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
303  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
304  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
305 }
306 
307 inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
308 {
309  CRYPTOPP_ASSERT(i_state != NULLPTR);
310 
311  lsh_u64* submsg_o_l = i_state->submsg_o_l;
312  lsh_u64* submsg_o_r = i_state->submsg_o_r;
313 
314  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
315  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
316  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
317  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
318  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
319  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
320 
321  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
322  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
323  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
324  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
325  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
326  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
327 }
328 
329 inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
330 {
331  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
332  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
333  _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
334  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
335  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
336  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
337 }
338 
339 template <unsigned int R>
340 inline void rotate_blk(lsh_u64 cv[8])
341 {
342  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
343  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
344  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
345  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
346  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
347  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
348 }
349 
350 inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
351 {
352  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
353  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
354  _mm256_loadu_si256(CONST_M256_CAST(const_v))));
355  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
356  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
357  _mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
358 }
359 
360 inline void rotate_msg_gamma(lsh_u64 cv_r[8])
361 {
362  // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
363  _mm256_storeu_si256(M256_CAST(cv_r+0),
364  _mm256_shuffle_epi8(
365  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
366  _mm256_set_epi8(
367  /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
368  /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
369  _mm256_storeu_si256(M256_CAST(cv_r+4),
370  _mm256_shuffle_epi8(
371  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
372  _mm256_set_epi8(
373  /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
374  /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
375 }
376 
377 inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
378 {
379  __m256i temp[2];
380  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
381  _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
382  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
383  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
384  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
385  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
386  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
387  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
388 
389  temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
390  temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));
391 
392  _mm256_storeu_si256(M256_CAST(cv_l+0),
393  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
394  _mm256_storeu_si256(M256_CAST(cv_l+4),
395  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));
396 
397  _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
398  _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
399 };
400 
401 /* -------------------------------------------------------- *
402 * step function
403 * -------------------------------------------------------- */
404 
405 template <unsigned int Alpha, unsigned int Beta>
406 inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
407 {
408  add_blk(cv_l, cv_r);
409  rotate_blk<Alpha>(cv_l);
410  xor_with_const(cv_l, const_v);
411  add_blk(cv_r, cv_l);
412  rotate_blk<Beta>(cv_r);
413  add_blk(cv_l, cv_r);
414  rotate_msg_gamma(cv_r);
415 }
416 
417 /* -------------------------------------------------------- *
418 * compression function
419 * -------------------------------------------------------- */
420 
421 inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
422 {
423  CRYPTOPP_ASSERT(ctx != NULLPTR);
424 
425  LSH512_AVX2_Internal s_state(ctx->cv_l);
426  LSH512_AVX2_Internal* i_state = &s_state;
427 
428  const lsh_u64* const_v = NULL;
429  lsh_u64 *cv_l = ctx->cv_l;
430  lsh_u64 *cv_r = ctx->cv_r;
431 
432  load_msg_blk(i_state, pdMsgBlk);
433 
434  msg_add_even(cv_l, cv_r, i_state);
435  load_sc(&const_v, 0);
436  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
437  word_perm(cv_l, cv_r);
438 
439  msg_add_odd(cv_l, cv_r, i_state);
440  load_sc(&const_v, 8);
441  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
442  word_perm(cv_l, cv_r);
443 
444  for (size_t i = 1; i < NUM_STEPS / 2; i++)
445  {
446  msg_exp_even(i_state);
447  msg_add_even(cv_l, cv_r, i_state);
448  load_sc(&const_v, 16 * i);
449  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
450  word_perm(cv_l, cv_r);
451 
452  msg_exp_odd(i_state);
453  msg_add_odd(cv_l, cv_r, i_state);
454  load_sc(&const_v, 16 * i + 8);
455  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
456  word_perm(cv_l, cv_r);
457  }
458 
459  msg_exp_even(i_state);
460  msg_add_even(cv_l, cv_r, i_state);
461 }
462 
463 /* -------------------------------------------------------- */
464 
465 inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
466 {
467  // The IV's are 32-byte aligned so we can use aligned loads.
468  _mm256_storeu_si256(M256_CAST(cv_l+0),
469  _mm256_load_si256(CONST_M256_CAST(iv+0)));
470  _mm256_storeu_si256(M256_CAST(cv_l+4),
471  _mm256_load_si256(CONST_M256_CAST(iv+4)));
472 
473  _mm256_storeu_si256(M256_CAST(cv_r+0),
474  _mm256_load_si256(CONST_M256_CAST(iv+8)));
475  _mm256_storeu_si256(M256_CAST(cv_r+4),
476  _mm256_load_si256(CONST_M256_CAST(iv+12)));
477 }
478 
479 inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
480 {
481  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
482  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
483  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
484  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
485 }
486 
487 inline void zero_submsgs(LSH512_AVX2_Context* ctx)
488 {
489  lsh_u64* sub_msgs = ctx->sub_msgs;
490 
491  _mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
492  _mm256_setzero_si256());
493  _mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
494  _mm256_setzero_si256());
495 
496  _mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
497  _mm256_setzero_si256());
498  _mm256_storeu_si256(M256_CAST(sub_msgs+12),
499  _mm256_setzero_si256());
500 }
501 
502 inline void init224(LSH512_AVX2_Context* ctx)
503 {
504  CRYPTOPP_ASSERT(ctx != NULLPTR);
505 
506  zero_submsgs(ctx);
507  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
508 }
509 
510 inline void init256(LSH512_AVX2_Context* ctx)
511 {
512  CRYPTOPP_ASSERT(ctx != NULLPTR);
513 
514  zero_submsgs(ctx);
515  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
516 }
517 
518 inline void init384(LSH512_AVX2_Context* ctx)
519 {
520  CRYPTOPP_ASSERT(ctx != NULLPTR);
521 
522  zero_submsgs(ctx);
523  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
524 }
525 
526 inline void init512(LSH512_AVX2_Context* ctx)
527 {
528  CRYPTOPP_ASSERT(ctx != NULLPTR);
529 
530  zero_submsgs(ctx);
531  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
532 }
533 
534 /* -------------------------------------------------------- */
535 
536 inline void fin(LSH512_AVX2_Context* ctx)
537 {
538  CRYPTOPP_ASSERT(ctx != NULLPTR);
539 
540  _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
541  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
542  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
543 
544  _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
545  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
546  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
547 }
548 
549 /* -------------------------------------------------------- */
550 
551 inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
552 {
553  CRYPTOPP_ASSERT(ctx != NULLPTR);
554  CRYPTOPP_ASSERT(ctx->alg_type != 0);
555  CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
556 
557  lsh_uint alg_type = ctx->alg_type;
558  lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
559  lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
560 
561  // Multiplying by sizeof(lsh_u8) looks odd...
562  memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
563  if (hash_val_bit_len){
564  pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
565  }
566 }
567 
568 /* -------------------------------------------------------- */
569 
570 lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
571 {
572  CRYPTOPP_ASSERT(ctx != NULLPTR);
573  CRYPTOPP_ASSERT(ctx->alg_type != 0);
574 
575  lsh_u32 alg_type = ctx->alg_type;
576  const lsh_u64* const_v = NULL;
577  ctx->remain_databitlen = 0;
578 
579  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
580  AVX_Cleanup cleanup;
581 
582  switch (alg_type){
583  case LSH_TYPE_512_512:
584  init512(ctx);
585  return LSH_SUCCESS;
586  case LSH_TYPE_512_384:
587  init384(ctx);
588  return LSH_SUCCESS;
589  case LSH_TYPE_512_256:
590  init256(ctx);
591  return LSH_SUCCESS;
592  case LSH_TYPE_512_224:
593  init224(ctx);
594  return LSH_SUCCESS;
595  default:
596  break;
597  }
598 
599  lsh_u64* cv_l = ctx->cv_l;
600  lsh_u64* cv_r = ctx->cv_r;
601 
602  zero_iv(cv_l, cv_r);
603  cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
604  cv_l[1] = LSH_GET_HASHBIT(alg_type);
605 
606  for (size_t i = 0; i < NUM_STEPS / 2; i++)
607  {
608  //Mix
609  load_sc(&const_v, i * 16);
610  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
611  word_perm(cv_l, cv_r);
612 
613  load_sc(&const_v, i * 16 + 8);
614  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
615  word_perm(cv_l, cv_r);
616  }
617 
618  return LSH_SUCCESS;
619 }
620 
621 lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
622 {
623  CRYPTOPP_ASSERT(ctx != NULLPTR);
624  CRYPTOPP_ASSERT(data != NULLPTR);
625  CRYPTOPP_ASSERT(databitlen % 8 == 0);
626  CRYPTOPP_ASSERT(ctx->alg_type != 0);
627 
628  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
629  AVX_Cleanup cleanup;
630 
631  if (databitlen == 0){
632  return LSH_SUCCESS;
633  }
634 
635  // We are byte oriented. tail bits will always be 0.
636  size_t databytelen = databitlen >> 3;
637  // lsh_uint pos2 = databitlen & 0x7;
638  const size_t pos2 = 0;
639 
640  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
641  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
642  const size_t remain_msg_bit = 0;
643 
644  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
645  return LSH_ERR_INVALID_STATE;
646  }
647  if (remain_msg_bit > 0){
648  return LSH_ERR_INVALID_DATABITLEN;
649  }
650 
651  if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
652  memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
653  ctx->remain_databitlen += (lsh_uint)databitlen;
654  remain_msg_byte += (lsh_uint)databytelen;
655  if (pos2){
656  ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
657  }
658  return LSH_SUCCESS;
659  }
660 
661  if (remain_msg_byte > 0){
662  size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
663  memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
664  compress(ctx, ctx->last_block);
665  data += more_byte;
666  databytelen -= more_byte;
667  remain_msg_byte = 0;
668  ctx->remain_databitlen = 0;
669  }
670 
671  while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
672  {
673  // This call to compress caused some trouble.
674  // The data pointer can become unaligned in the
675  // previous block.
676  compress(ctx, data);
677  data += LSH512_MSG_BLK_BYTE_LEN;
678  databytelen -= LSH512_MSG_BLK_BYTE_LEN;
679  }
680 
681  if (databytelen > 0){
682  memcpy(ctx->last_block, data, databytelen);
683  ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
684  }
685 
686  if (pos2){
687  ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
688  ctx->remain_databitlen += pos2;
689  }
690  return LSH_SUCCESS;
691 }
692 
693 lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
694 {
695  CRYPTOPP_ASSERT(ctx != NULLPTR);
696  CRYPTOPP_ASSERT(hashval != NULLPTR);
697 
698  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
699  AVX_Cleanup cleanup;
700 
701  // We are byte oriented. tail bits will always be 0.
702  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
703  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
704  const size_t remain_msg_bit = 0;
705 
706  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
707  return LSH_ERR_INVALID_STATE;
708  }
709 
710  if (remain_msg_bit){
711  ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
712  }
713  else{
714  ctx->last_block[remain_msg_byte] = 0x80;
715  }
716  memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
717 
718  compress(ctx, ctx->last_block);
719 
720  fin(ctx);
721  get_hash(ctx, hashval);
722 
723  return LSH_SUCCESS;
724 }
725 
726 ANONYMOUS_NAMESPACE_END
727 
728 NAMESPACE_BEGIN(CryptoPP)
729 
730 extern
731 void LSH512_Base_Restart_AVX2(word64* state)
732 {
733  state[RemainingBits] = 0;
734  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
735  lsh_err err = lsh512_init_avx2(&ctx);
736 
737  if (err != LSH_SUCCESS)
738  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
739 }
740 
741 extern
742 void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
743 {
744  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
745  lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
746 
747  if (err != LSH_SUCCESS)
748  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
749 }
750 
751 extern
752 void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
753 {
754  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
755  lsh_err err = lsh512_final_avx2(&ctx, hash);
756 
757  if (err != LSH_SUCCESS)
758  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
759 }
760 
761 NAMESPACE_END
762 
763 #endif // CRYPTOPP_AVX2_AVAILABLE
Base class for all exceptions thrown by the library.
Definition: cryptlib.h:159
@ OTHER_ERROR
Some other error occurred not belonging to other categories.
Definition: cryptlib.h:177
Library configuration file.
unsigned char byte
8-bit unsigned datatype
Definition: config_int.h:56
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:91
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:145
EnumToType< ByteOrder, LITTLE_ENDIAN_ORDER > LittleEndian
Provides a constant for LittleEndian.
Definition: cryptlib.h:150
Classes for the LSH hash functions.
Utility functions for the Crypto++ library.
T rotlConstant(T x)
Performs a left rotate.
Definition: misc.h:1548
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
Definition: misc.h:2208
T rotlFixed(T x, unsigned int y)
Performs a left rotate.
Definition: misc.h:1599
Crypto++ library namespace.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68