// Copyright (c) 2022 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
// Written and placed in public domain by Jeffrey Walton.
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
// Barry O'Rourke for the mbedTLS project.
// Variant specialized for 64-byte inputs added by Pieter Wuille.

#ifdef ENABLE_ARM_SHANI

#include <array>
#include <cstdint>
#include <cstddef>
#include <arm_acle.h>
#include <arm_neon.h>

namespace {
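// SHA-256 round constants: K[i] is the first 32 bits of the fractional part of
// the cube root of the i-th prime. For example, cbrt(2) = 1.2599..., and
// 0.2599... * 2^32 rounds down to 0x428A2F98.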
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
{
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}

namespace sha256_arm_shani {
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // The original implementation preloaded the message-plus-constant
        // additions, which was 1-3% slower. They are now done as the first step
        // of each quad-round block instead, saving one Q NEON register:
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"

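        // Each quad-round block below performs four SHA-256 rounds at once:
        // vsha256hq_u32 and vsha256h2q_u32 update the two halves of the working
        // state, while vsha256su0q_u32/vsha256su1q_u32 extend the message
        // schedule, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
        // four words at a time.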
        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
}
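
// Usage sketch: Transform processes whole 64-byte blocks and leaves all
// padding to the caller. A minimal single-block call (hypothetical, starting
// from the standard SHA-256 initial state) might look like:
//
//     uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//     unsigned char block[64] = {/* one padded 64-byte message block */};
//     sha256_arm_shani::Transform(state, block, 1);
//     // state[0..7] now holds the updated hash words.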

namespace sha256d64_arm_shani {
void Transform_2way(unsigned char* output, const unsigned char* input)
{
    /* Initial state. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    /* Precomputed message schedule for the 2nd transform: W[i] + K[i] for the
       fixed padding block that follows a 64-byte message. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
    };

    /* A few precomputed message schedule values for the 3rd transform. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
    };

    /* Padding processed in the 3rd transform (byteswapped). */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};

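    // Derivation sketch for the tables above: after a 64-byte message, SHA-256
    // always processes the same padding block {0x80000000, 0, ..., 0, 0x200}
    // (0x200 = 512 message bits), so that block's entire expanded message
    // schedule is constant and MIDS[i] = W[i] + K[i]; e.g. MIDS[0] =
    // 0x80000000 + 0x428a2f98 = 0xc28a2f98. The 3rd transform hashes the
    // 32-byte intermediate digest, whose fixed padding words are W[8..15] =
    // {0x80000000, 0, ..., 0, 0x100} (0x100 = 256 bits, the FINAL constants);
    // FINS packs W[8..11] + K[8..11], the raw W[8..11], and W[12..15] + K[12..15]
    // (e.g. 0xc19bf174 + 0x100 = 0xc19bf274).
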
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;

    // Transform 1: Load state
    STATE0A = vld1q_u32(&INIT[0]);
    STATE0B = STATE0A;
    STATE1A = vld1q_u32(&INIT[4]);
    STATE1B = STATE1A;

    // Transform 1: Load and convert input chunk to Big Endian
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));

    // Transform 1: Rounds 1-4
    TMP = vld1q_u32(&K[0]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 5-8
    TMP = vld1q_u32(&K[4]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 9-12
    TMP = vld1q_u32(&K[8]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 13-16
    TMP = vld1q_u32(&K[12]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 17-20
    TMP = vld1q_u32(&K[16]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 21-24
    TMP = vld1q_u32(&K[20]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 25-28
    TMP = vld1q_u32(&K[24]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 29-32
    TMP = vld1q_u32(&K[28]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 33-36
    TMP = vld1q_u32(&K[32]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 37-40
    TMP = vld1q_u32(&K[36]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 41-44
    TMP = vld1q_u32(&K[40]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 45-48
    TMP = vld1q_u32(&K[44]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 49-52
    TMP = vld1q_u32(&K[48]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 53-56
    TMP = vld1q_u32(&K[52]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 57-60
    TMP = vld1q_u32(&K[56]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 61-64
    TMP = vld1q_u32(&K[60]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Update state
    TMP = vld1q_u32(&INIT[0]);
    STATE0A = vaddq_u32(STATE0A, TMP);
    STATE0B = vaddq_u32(STATE0B, TMP);
    TMP = vld1q_u32(&INIT[4]);
    STATE1A = vaddq_u32(STATE1A, TMP);
    STATE1B = vaddq_u32(STATE1B, TMP);

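    // Since the 2nd transform's message block is entirely fixed padding, its
    // quad rounds below need no vsha256su0/vsha256su1 schedule updates; each
    // one just consumes four precomputed MIDS words.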
    // Transform 2: Save state
    ABEF_SAVEA = STATE0A;
    ABEF_SAVEB = STATE0B;
    CDGH_SAVEA = STATE1A;
    CDGH_SAVEB = STATE1B;

    // Transform 2: Rounds 1-4
    TMP = vld1q_u32(&MIDS[0]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 5-8
    TMP = vld1q_u32(&MIDS[4]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 9-12
    TMP = vld1q_u32(&MIDS[8]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 13-16
    TMP = vld1q_u32(&MIDS[12]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 17-20
    TMP = vld1q_u32(&MIDS[16]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 21-24
    TMP = vld1q_u32(&MIDS[20]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 25-28
    TMP = vld1q_u32(&MIDS[24]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 29-32
    TMP = vld1q_u32(&MIDS[28]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 33-36
    TMP = vld1q_u32(&MIDS[32]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 37-40
    TMP = vld1q_u32(&MIDS[36]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 41-44
    TMP = vld1q_u32(&MIDS[40]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 45-48
    TMP = vld1q_u32(&MIDS[44]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 49-52
    TMP = vld1q_u32(&MIDS[48]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 53-56
    TMP = vld1q_u32(&MIDS[52]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 57-60
    TMP = vld1q_u32(&MIDS[56]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 61-64
    TMP = vld1q_u32(&MIDS[60]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Update state
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);

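    // The 3rd transform hashes, per lane, the 64-byte block consisting of the
    // 32-byte digest just computed (STATE0/STATE1) followed by the FINAL
    // padding words, producing the double-SHA256 result.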
    // Transform 3: Pad previous output
    MSG0A = STATE0A;
    MSG0B = STATE0B;
    MSG1A = STATE1A;
    MSG1B = STATE1B;
    MSG2A = vld1q_u32(&FINAL[0]);
    MSG2B = MSG2A;
    MSG3A = vld1q_u32(&FINAL[4]);
    MSG3B = MSG3A;

    // Transform 3: Load state
    STATE0A = vld1q_u32(&INIT[0]);
    STATE0B = STATE0A;
    STATE1A = vld1q_u32(&INIT[4]);
    STATE1B = STATE1A;

    // Transform 3: Rounds 1-4
    TMP = vld1q_u32(&K[0]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 5-8
    TMP = vld1q_u32(&K[4]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 9-12
    TMP = vld1q_u32(&FINS[0]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vld1q_u32(&FINS[4]);
    MSG2B = MSG2A;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 13-16
    TMP = vld1q_u32(&FINS[8]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 17-20
    TMP = vld1q_u32(&K[16]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 21-24
    TMP = vld1q_u32(&K[20]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 25-28
    TMP = vld1q_u32(&K[24]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 29-32
    TMP = vld1q_u32(&K[28]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 33-36
    TMP = vld1q_u32(&K[32]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 37-40
    TMP = vld1q_u32(&K[36]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 41-44
    TMP = vld1q_u32(&K[40]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 45-48
    TMP = vld1q_u32(&K[44]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 49-52
    TMP = vld1q_u32(&K[48]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 53-56
    TMP = vld1q_u32(&K[52]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 57-60
    TMP = vld1q_u32(&K[56]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 61-64
    TMP = vld1q_u32(&K[60]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Update state
    TMP = vld1q_u32(&INIT[0]);
    STATE0A = vaddq_u32(STATE0A, TMP);
    STATE0B = vaddq_u32(STATE0B, TMP);
    TMP = vld1q_u32(&INIT[4]);
    STATE1A = vaddq_u32(STATE1A, TMP);
    STATE1B = vaddq_u32(STATE1B, TMP);

    // Store result
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
}
}
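
// Usage sketch (an assumption about the caller, not shown in this file):
// Transform_2way computes SHA256(SHA256(x)) of two independent 64-byte inputs
// at once, e.g. two concatenated pairs of 32-byte hashes when computing a
// Merkle tree. A call might look like:
//
//     unsigned char in[128];  // two 64-byte inputs, back to back
//     unsigned char out[64];  // receives the two 32-byte double-SHA256 digests
//     sha256d64_arm_shani::Transform_2way(out, in);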

#endif