// Copyright (c) 2017-2022 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// This is a translation to GCC extended asm syntax from YASM code by Intel
// (available at the bottom of this file).

#include <cstdlib>
#include <stdint.h>

#if defined(__x86_64__) || defined(__amd64__)

namespace sha256_sse4
{
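// Perform SHA-256 compression of `blocks` consecutive 64-byte message blocks
// starting at `chunk`, updating the eight 32-bit state words at `s` in place.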
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
#if defined(__clang__) && !defined(__OPTIMIZE__)
    /*
      clang is unable to compile this with -O0 and -fsanitize=address.
      See upstream bug: https://github.com/llvm/llvm-project/issues/92182
    */
    __attribute__((no_sanitize("address")))
#endif
{
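    // The 64 SHA-256 round constants.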
    static const uint32_t K256 alignas(16) [] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
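    // Byte-order flip mask for loading the big-endian message words, plus the
    // shuffle masks used to pack the low (00BA) and high (DC00) halves of s1
    // in the message schedule (see the YASM original below).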
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
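    // Working variables a-h (e is kept in the register that carries the
    // `blocks` operand, addressed as %k2) and the round temporaries y0-y2.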
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
    uint64_t tbl;
    uint64_t inp_end, inp;
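    // 16-byte-aligned scratch slot carrying the four K+W words of the current
    // round group from the vector unit to the scalar rounds.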
    uint32_t xfer alignas(16) [4];

    __asm__ __volatile__(
        "shl $0x6,%2;"
        "je Ldone_hash_%=;"
        "add %1,%2;"
        "mov %2,%14;"
        "mov (%0),%3;"
        "mov 0x4(%0),%4;"
        "mov 0x8(%0),%5;"
        "mov 0xc(%0),%6;"
        "mov 0x10(%0),%k2;"
        "mov 0x14(%0),%7;"
        "mov 0x18(%0),%8;"
        "mov 0x1c(%0),%9;"
        "movdqa %18,%%xmm12;"
        "movdqa %19,%%xmm10;"
        "movdqa %20,%%xmm11;"

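        // Outer loop: one iteration per 64-byte input block.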
        "Lloop0_%=:"
        "lea %17,%13;"
        "movdqu (%1),%%xmm4;"
        "pshufb %%xmm12,%%xmm4;"
        "movdqu 0x10(%1),%%xmm5;"
        "pshufb %%xmm12,%%xmm5;"
        "movdqu 0x20(%1),%%xmm6;"
        "pshufb %%xmm12,%%xmm6;"
        "movdqu 0x30(%1),%%xmm7;"
        "pshufb %%xmm12,%%xmm7;"
        "mov %1,%15;"
        "mov $3,%1;"

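        // Rounds 0-47: three passes of 16 rounds each, interleaving the
        // message schedule with the round computations.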
        "Lloop1_%=:"
        "movdqa 0x0(%13),%%xmm9;"
        "paddd %%xmm4,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm7,%%xmm0;"
        "mov %k2,%10;"
        "ror $0xe,%10;"
        "mov %3,%11;"
        "palignr $0x4,%%xmm6,%%xmm0;"
        "ror $0x9,%11;"
        "xor %k2,%10;"
        "mov %7,%12;"
        "ror $0x5,%10;"
        "movdqa %%xmm5,%%xmm1;"
        "xor %3,%11;"
        "xor %8,%12;"
        "paddd %%xmm4,%%xmm0;"
        "xor %k2,%10;"
        "and %k2,%12;"
        "ror $0xb,%11;"
        "palignr $0x4,%%xmm4,%%xmm1;"
        "xor %3,%11;"
        "ror $0x6,%10;"
        "xor %8,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov %3,%10;"
        "add %12,%9;"
        "mov %3,%12;"
        "pslld $0x19,%%xmm1;"
        "or %5,%10;"
        "add %9,%6;"
        "and %5,%12;"
        "psrld $0x7,%%xmm2;"
        "and %4,%10;"
        "add %11,%9;"
        "por %%xmm2,%%xmm1;"
        "or %12,%10;"
        "add %10,%9;"
        "movdqa %%xmm3,%%xmm2;"
        "mov %6,%10;"
        "mov %9,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror $0xe,%10;"
        "xor %6,%10;"
        "mov %k2,%12;"
        "ror $0x9,%11;"
        "pslld $0xe,%%xmm3;"
        "xor %9,%11;"
        "ror $0x5,%10;"
        "xor %7,%12;"
        "psrld $0x12,%%xmm2;"
        "ror $0xb,%11;"
        "xor %6,%10;"
        "and %6,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm1;"
        "xor %9,%11;"
        "xor %7,%12;"
        "psrld $0x3,%%xmm8;"
        "add %10,%12;"
        "add 4+%16,%12;"
        "ror $0x2,%11;"
        "pxor %%xmm2,%%xmm1;"
        "mov %9,%10;"
        "add %12,%8;"
        "mov %9,%12;"
        "pxor %%xmm8,%%xmm1;"
        "or %4,%10;"
        "add %8,%5;"
        "and %4,%12;"
        "pshufd $0xfa,%%xmm7,%%xmm2;"
        "and %3,%10;"
        "add %11,%8;"
        "paddd %%xmm1,%%xmm0;"
        "or %12,%10;"
        "add %10,%8;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %5,%10;"
        "mov %8,%11;"
        "ror $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor %5,%10;"
        "ror $0x9,%11;"
        "mov %6,%12;"
        "xor %8,%11;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %k2,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %5,%10;"
        "and %5,%12;"
        "psrld $0xa,%%xmm8;"
        "ror $0xb,%11;"
        "xor %8,%11;"
        "xor %k2,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm2;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "pxor %%xmm2,%%xmm8;"
        "mov %8,%10;"
        "add %12,%7;"
        "mov %8,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or %3,%10;"
        "add %7,%4;"
        "and %3,%12;"
        "paddd %%xmm8,%%xmm0;"
        "and %9,%10;"
        "add %11,%7;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or %12,%10;"
        "add %10,%7;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %4,%10;"
        "ror $0xe,%10;"
        "mov %7,%11;"
        "movdqa %%xmm2,%%xmm4;"
        "ror $0x9,%11;"
        "xor %4,%10;"
        "mov %5,%12;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %7,%11;"
        "xor %6,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %4,%10;"
        "and %4,%12;"
        "ror $0xb,%11;"
        "psrld $0xa,%%xmm4;"
        "xor %7,%11;"
        "ror $0x6,%10;"
        "xor %6,%12;"
        "pxor %%xmm3,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add 12+%16,%12;"
        "pxor %%xmm2,%%xmm4;"
        "mov %7,%10;"
        "add %12,%k2;"
        "mov %7,%12;"
        "pshufb %%xmm11,%%xmm4;"
        "or %9,%10;"
        "add %k2,%3;"
        "and %9,%12;"
        "paddd %%xmm0,%%xmm4;"
        "and %8,%10;"
        "add %11,%k2;"
        "or %12,%10;"
        "add %10,%k2;"
        "movdqa 0x10(%13),%%xmm9;"
        "paddd %%xmm5,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm4,%%xmm0;"
        "mov %3,%10;"
        "ror $0xe,%10;"
        "mov %k2,%11;"
        "palignr $0x4,%%xmm7,%%xmm0;"
        "ror $0x9,%11;"
        "xor %3,%10;"
        "mov %4,%12;"
        "ror $0x5,%10;"
        "movdqa %%xmm6,%%xmm1;"
        "xor %k2,%11;"
        "xor %5,%12;"
        "paddd %%xmm5,%%xmm0;"
        "xor %3,%10;"
        "and %3,%12;"
        "ror $0xb,%11;"
        "palignr $0x4,%%xmm5,%%xmm1;"
        "xor %k2,%11;"
        "ror $0x6,%10;"
        "xor %5,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov %k2,%10;"
        "add %12,%6;"
        "mov %k2,%12;"
        "pslld $0x19,%%xmm1;"
        "or %8,%10;"
        "add %6,%9;"
        "and %8,%12;"
        "psrld $0x7,%%xmm2;"
        "and %7,%10;"
        "add %11,%6;"
        "por %%xmm2,%%xmm1;"
        "or %12,%10;"
        "add %10,%6;"
        "movdqa %%xmm3,%%xmm2;"
        "mov %9,%10;"
        "mov %6,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror $0xe,%10;"
        "xor %9,%10;"
        "mov %3,%12;"
        "ror $0x9,%11;"
        "pslld $0xe,%%xmm3;"
        "xor %6,%11;"
        "ror $0x5,%10;"
        "xor %4,%12;"
        "psrld $0x12,%%xmm2;"
        "ror $0xb,%11;"
        "xor %9,%10;"
        "and %9,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm1;"
        "xor %6,%11;"
        "xor %4,%12;"
        "psrld $0x3,%%xmm8;"
        "add %10,%12;"
        "add 4+%16,%12;"
        "ror $0x2,%11;"
        "pxor %%xmm2,%%xmm1;"
        "mov %6,%10;"
        "add %12,%5;"
        "mov %6,%12;"
        "pxor %%xmm8,%%xmm1;"
        "or %7,%10;"
        "add %5,%8;"
        "and %7,%12;"
        "pshufd $0xfa,%%xmm4,%%xmm2;"
        "and %k2,%10;"
        "add %11,%5;"
        "paddd %%xmm1,%%xmm0;"
        "or %12,%10;"
        "add %10,%5;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %8,%10;"
        "mov %5,%11;"
        "ror $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor %8,%10;"
        "ror $0x9,%11;"
        "mov %9,%12;"
        "xor %5,%11;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %3,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %8,%10;"
        "and %8,%12;"
        "psrld $0xa,%%xmm8;"
        "ror $0xb,%11;"
        "xor %5,%11;"
        "xor %3,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm2;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "pxor %%xmm2,%%xmm8;"
        "mov %5,%10;"
        "add %12,%4;"
        "mov %5,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or %k2,%10;"
        "add %4,%7;"
        "and %k2,%12;"
        "paddd %%xmm8,%%xmm0;"
        "and %6,%10;"
        "add %11,%4;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or %12,%10;"
        "add %10,%4;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %7,%10;"
        "ror $0xe,%10;"
        "mov %4,%11;"
        "movdqa %%xmm2,%%xmm5;"
        "ror $0x9,%11;"
        "xor %7,%10;"
        "mov %8,%12;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %4,%11;"
        "xor %9,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %7,%10;"
        "and %7,%12;"
        "ror $0xb,%11;"
        "psrld $0xa,%%xmm5;"
        "xor %4,%11;"
        "ror $0x6,%10;"
        "xor %9,%12;"
        "pxor %%xmm3,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add 12+%16,%12;"
        "pxor %%xmm2,%%xmm5;"
        "mov %4,%10;"
        "add %12,%3;"
        "mov %4,%12;"
        "pshufb %%xmm11,%%xmm5;"
        "or %6,%10;"
        "add %3,%k2;"
        "and %6,%12;"
        "paddd %%xmm0,%%xmm5;"
        "and %5,%10;"
        "add %11,%3;"
        "or %12,%10;"
        "add %10,%3;"
        "movdqa 0x20(%13),%%xmm9;"
        "paddd %%xmm6,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm5,%%xmm0;"
        "mov %k2,%10;"
        "ror $0xe,%10;"
        "mov %3,%11;"
        "palignr $0x4,%%xmm4,%%xmm0;"
        "ror $0x9,%11;"
        "xor %k2,%10;"
        "mov %7,%12;"
        "ror $0x5,%10;"
        "movdqa %%xmm7,%%xmm1;"
        "xor %3,%11;"
        "xor %8,%12;"
        "paddd %%xmm6,%%xmm0;"
        "xor %k2,%10;"
        "and %k2,%12;"
        "ror $0xb,%11;"
        "palignr $0x4,%%xmm6,%%xmm1;"
        "xor %3,%11;"
        "ror $0x6,%10;"
        "xor %8,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov %3,%10;"
        "add %12,%9;"
        "mov %3,%12;"
        "pslld $0x19,%%xmm1;"
        "or %5,%10;"
        "add %9,%6;"
        "and %5,%12;"
        "psrld $0x7,%%xmm2;"
        "and %4,%10;"
        "add %11,%9;"
        "por %%xmm2,%%xmm1;"
        "or %12,%10;"
        "add %10,%9;"
        "movdqa %%xmm3,%%xmm2;"
        "mov %6,%10;"
        "mov %9,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror $0xe,%10;"
        "xor %6,%10;"
        "mov %k2,%12;"
        "ror $0x9,%11;"
        "pslld $0xe,%%xmm3;"
        "xor %9,%11;"
        "ror $0x5,%10;"
        "xor %7,%12;"
        "psrld $0x12,%%xmm2;"
        "ror $0xb,%11;"
        "xor %6,%10;"
        "and %6,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm1;"
        "xor %9,%11;"
        "xor %7,%12;"
        "psrld $0x3,%%xmm8;"
        "add %10,%12;"
        "add 4+%16,%12;"
        "ror $0x2,%11;"
        "pxor %%xmm2,%%xmm1;"
        "mov %9,%10;"
        "add %12,%8;"
        "mov %9,%12;"
        "pxor %%xmm8,%%xmm1;"
        "or %4,%10;"
        "add %8,%5;"
        "and %4,%12;"
        "pshufd $0xfa,%%xmm5,%%xmm2;"
        "and %3,%10;"
        "add %11,%8;"
        "paddd %%xmm1,%%xmm0;"
        "or %12,%10;"
        "add %10,%8;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %5,%10;"
        "mov %8,%11;"
        "ror $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor %5,%10;"
        "ror $0x9,%11;"
        "mov %6,%12;"
        "xor %8,%11;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %k2,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %5,%10;"
        "and %5,%12;"
        "psrld $0xa,%%xmm8;"
        "ror $0xb,%11;"
        "xor %8,%11;"
        "xor %k2,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm2;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "pxor %%xmm2,%%xmm8;"
        "mov %8,%10;"
        "add %12,%7;"
        "mov %8,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or %3,%10;"
        "add %7,%4;"
        "and %3,%12;"
        "paddd %%xmm8,%%xmm0;"
        "and %9,%10;"
        "add %11,%7;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or %12,%10;"
        "add %10,%7;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %4,%10;"
        "ror $0xe,%10;"
        "mov %7,%11;"
        "movdqa %%xmm2,%%xmm6;"
        "ror $0x9,%11;"
        "xor %4,%10;"
        "mov %5,%12;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %7,%11;"
        "xor %6,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %4,%10;"
        "and %4,%12;"
        "ror $0xb,%11;"
        "psrld $0xa,%%xmm6;"
        "xor %7,%11;"
        "ror $0x6,%10;"
        "xor %6,%12;"
        "pxor %%xmm3,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add 12+%16,%12;"
        "pxor %%xmm2,%%xmm6;"
        "mov %7,%10;"
        "add %12,%k2;"
        "mov %7,%12;"
        "pshufb %%xmm11,%%xmm6;"
        "or %9,%10;"
        "add %k2,%3;"
        "and %9,%12;"
        "paddd %%xmm0,%%xmm6;"
        "and %8,%10;"
        "add %11,%k2;"
        "or %12,%10;"
        "add %10,%k2;"
        "movdqa 0x30(%13),%%xmm9;"
        "paddd %%xmm7,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "add $0x40,%13;"
        "movdqa %%xmm6,%%xmm0;"
        "mov %3,%10;"
        "ror $0xe,%10;"
        "mov %k2,%11;"
        "palignr $0x4,%%xmm5,%%xmm0;"
        "ror $0x9,%11;"
        "xor %3,%10;"
        "mov %4,%12;"
        "ror $0x5,%10;"
        "movdqa %%xmm4,%%xmm1;"
        "xor %k2,%11;"
        "xor %5,%12;"
        "paddd %%xmm7,%%xmm0;"
        "xor %3,%10;"
        "and %3,%12;"
        "ror $0xb,%11;"
        "palignr $0x4,%%xmm7,%%xmm1;"
        "xor %k2,%11;"
        "ror $0x6,%10;"
        "xor %5,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov %k2,%10;"
        "add %12,%6;"
        "mov %k2,%12;"
        "pslld $0x19,%%xmm1;"
        "or %8,%10;"
        "add %6,%9;"
        "and %8,%12;"
        "psrld $0x7,%%xmm2;"
        "and %7,%10;"
        "add %11,%6;"
        "por %%xmm2,%%xmm1;"
        "or %12,%10;"
        "add %10,%6;"
        "movdqa %%xmm3,%%xmm2;"
        "mov %9,%10;"
        "mov %6,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror $0xe,%10;"
        "xor %9,%10;"
        "mov %3,%12;"
        "ror $0x9,%11;"
        "pslld $0xe,%%xmm3;"
        "xor %6,%11;"
        "ror $0x5,%10;"
        "xor %4,%12;"
        "psrld $0x12,%%xmm2;"
        "ror $0xb,%11;"
        "xor %9,%10;"
        "and %9,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm1;"
        "xor %6,%11;"
        "xor %4,%12;"
        "psrld $0x3,%%xmm8;"
        "add %10,%12;"
        "add 4+%16,%12;"
        "ror $0x2,%11;"
        "pxor %%xmm2,%%xmm1;"
        "mov %6,%10;"
        "add %12,%5;"
        "mov %6,%12;"
        "pxor %%xmm8,%%xmm1;"
        "or %7,%10;"
        "add %5,%8;"
        "and %7,%12;"
        "pshufd $0xfa,%%xmm6,%%xmm2;"
        "and %k2,%10;"
        "add %11,%5;"
        "paddd %%xmm1,%%xmm0;"
        "or %12,%10;"
        "add %10,%5;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %8,%10;"
        "mov %5,%11;"
        "ror $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor %8,%10;"
        "ror $0x9,%11;"
        "mov %9,%12;"
        "xor %5,%11;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %3,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %8,%10;"
        "and %8,%12;"
        "psrld $0xa,%%xmm8;"
        "ror $0xb,%11;"
        "xor %5,%11;"
        "xor %3,%12;"
        "ror $0x6,%10;"
        "pxor %%xmm3,%%xmm2;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "pxor %%xmm2,%%xmm8;"
        "mov %5,%10;"
        "add %12,%4;"
        "mov %5,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or %k2,%10;"
        "add %4,%7;"
        "and %k2,%12;"
        "paddd %%xmm8,%%xmm0;"
        "and %6,%10;"
        "add %11,%4;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or %12,%10;"
        "add %10,%4;"
        "movdqa %%xmm2,%%xmm3;"
        "mov %7,%10;"
        "ror $0xe,%10;"
        "mov %4,%11;"
        "movdqa %%xmm2,%%xmm7;"
        "ror $0x9,%11;"
        "xor %7,%10;"
        "mov %8,%12;"
        "ror $0x5,%10;"
        "psrlq $0x11,%%xmm2;"
        "xor %4,%11;"
        "xor %9,%12;"
        "psrlq $0x13,%%xmm3;"
        "xor %7,%10;"
        "and %7,%12;"
        "ror $0xb,%11;"
        "psrld $0xa,%%xmm7;"
        "xor %4,%11;"
        "ror $0x6,%10;"
        "xor %9,%12;"
        "pxor %%xmm3,%%xmm2;"
        "ror $0x2,%11;"
        "add %10,%12;"
        "add 12+%16,%12;"
        "pxor %%xmm2,%%xmm7;"
        "mov %4,%10;"
        "add %12,%3;"
        "mov %4,%12;"
        "pshufb %%xmm11,%%xmm7;"
        "or %6,%10;"
        "add %3,%k2;"
        "and %6,%12;"
        "paddd %%xmm0,%%xmm7;"
        "and %5,%10;"
        "add %11,%3;"
        "or %12,%10;"
        "add %10,%3;"
        "sub $0x1,%1;"
        "jne Lloop1_%=;"
        "mov $0x2,%1;"

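        // Rounds 48-63: two passes of 8 rounds each; no further message
        // scheduling is needed.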
        "Lloop2_%=:"
        "paddd 0x0(%13),%%xmm4;"
        "movdqa %%xmm4,%16;"
        "mov %k2,%10;"
        "ror $0xe,%10;"
        "mov %3,%11;"
        "xor %k2,%10;"
        "ror $0x9,%11;"
        "mov %7,%12;"
        "xor %3,%11;"
        "ror $0x5,%10;"
        "xor %8,%12;"
        "xor %k2,%10;"
        "ror $0xb,%11;"
        "and %k2,%12;"
        "xor %3,%11;"
        "ror $0x6,%10;"
        "xor %8,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add %16,%12;"
        "mov %3,%10;"
        "add %12,%9;"
        "mov %3,%12;"
        "or %5,%10;"
        "add %9,%6;"
        "and %5,%12;"
        "and %4,%10;"
        "add %11,%9;"
        "or %12,%10;"
        "add %10,%9;"
        "mov %6,%10;"
        "ror $0xe,%10;"
        "mov %9,%11;"
        "xor %6,%10;"
        "ror $0x9,%11;"
        "mov %k2,%12;"
        "xor %9,%11;"
        "ror $0x5,%10;"
        "xor %7,%12;"
        "xor %6,%10;"
        "ror $0xb,%11;"
        "and %6,%12;"
        "xor %9,%11;"
        "ror $0x6,%10;"
        "xor %7,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 4+%16,%12;"
        "mov %9,%10;"
        "add %12,%8;"
        "mov %9,%12;"
        "or %4,%10;"
        "add %8,%5;"
        "and %4,%12;"
        "and %3,%10;"
        "add %11,%8;"
        "or %12,%10;"
        "add %10,%8;"
        "mov %5,%10;"
        "ror $0xe,%10;"
        "mov %8,%11;"
        "xor %5,%10;"
        "ror $0x9,%11;"
        "mov %6,%12;"
        "xor %8,%11;"
        "ror $0x5,%10;"
        "xor %k2,%12;"
        "xor %5,%10;"
        "ror $0xb,%11;"
        "and %5,%12;"
        "xor %8,%11;"
        "ror $0x6,%10;"
        "xor %k2,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "mov %8,%10;"
        "add %12,%7;"
        "mov %8,%12;"
        "or %3,%10;"
        "add %7,%4;"
        "and %3,%12;"
        "and %9,%10;"
        "add %11,%7;"
        "or %12,%10;"
        "add %10,%7;"
        "mov %4,%10;"
        "ror $0xe,%10;"
        "mov %7,%11;"
        "xor %4,%10;"
        "ror $0x9,%11;"
        "mov %5,%12;"
        "xor %7,%11;"
        "ror $0x5,%10;"
        "xor %6,%12;"
        "xor %4,%10;"
        "ror $0xb,%11;"
        "and %4,%12;"
        "xor %7,%11;"
        "ror $0x6,%10;"
        "xor %6,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 12+%16,%12;"
        "mov %7,%10;"
        "add %12,%k2;"
        "mov %7,%12;"
        "or %9,%10;"
        "add %k2,%3;"
        "and %9,%12;"
        "and %8,%10;"
        "add %11,%k2;"
        "or %12,%10;"
        "add %10,%k2;"
        "paddd 0x10(%13),%%xmm5;"
        "movdqa %%xmm5,%16;"
        "add $0x20,%13;"
        "mov %3,%10;"
        "ror $0xe,%10;"
        "mov %k2,%11;"
        "xor %3,%10;"
        "ror $0x9,%11;"
        "mov %4,%12;"
        "xor %k2,%11;"
        "ror $0x5,%10;"
        "xor %5,%12;"
        "xor %3,%10;"
        "ror $0xb,%11;"
        "and %3,%12;"
        "xor %k2,%11;"
        "ror $0x6,%10;"
        "xor %5,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add %16,%12;"
        "mov %k2,%10;"
        "add %12,%6;"
        "mov %k2,%12;"
        "or %8,%10;"
        "add %6,%9;"
        "and %8,%12;"
        "and %7,%10;"
        "add %11,%6;"
        "or %12,%10;"
        "add %10,%6;"
        "mov %9,%10;"
        "ror $0xe,%10;"
        "mov %6,%11;"
        "xor %9,%10;"
        "ror $0x9,%11;"
        "mov %3,%12;"
        "xor %6,%11;"
        "ror $0x5,%10;"
        "xor %4,%12;"
        "xor %9,%10;"
        "ror $0xb,%11;"
        "and %9,%12;"
        "xor %6,%11;"
        "ror $0x6,%10;"
        "xor %4,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 4+%16,%12;"
        "mov %6,%10;"
        "add %12,%5;"
        "mov %6,%12;"
        "or %7,%10;"
        "add %5,%8;"
        "and %7,%12;"
        "and %k2,%10;"
        "add %11,%5;"
        "or %12,%10;"
        "add %10,%5;"
        "mov %8,%10;"
        "ror $0xe,%10;"
        "mov %5,%11;"
        "xor %8,%10;"
        "ror $0x9,%11;"
        "mov %9,%12;"
        "xor %5,%11;"
        "ror $0x5,%10;"
        "xor %3,%12;"
        "xor %8,%10;"
        "ror $0xb,%11;"
        "and %8,%12;"
        "xor %5,%11;"
        "ror $0x6,%10;"
        "xor %3,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 8+%16,%12;"
        "mov %5,%10;"
        "add %12,%4;"
        "mov %5,%12;"
        "or %k2,%10;"
        "add %4,%7;"
        "and %k2,%12;"
        "and %6,%10;"
        "add %11,%4;"
        "or %12,%10;"
        "add %10,%4;"
        "mov %7,%10;"
        "ror $0xe,%10;"
        "mov %4,%11;"
        "xor %7,%10;"
        "ror $0x9,%11;"
        "mov %8,%12;"
        "xor %4,%11;"
        "ror $0x5,%10;"
        "xor %9,%12;"
        "xor %7,%10;"
        "ror $0xb,%11;"
        "and %7,%12;"
        "xor %4,%11;"
        "ror $0x6,%10;"
        "xor %9,%12;"
        "add %10,%12;"
        "ror $0x2,%11;"
        "add 12+%16,%12;"
        "mov %4,%10;"
        "add %12,%3;"
        "mov %4,%12;"
        "or %6,%10;"
        "add %3,%k2;"
        "and %6,%12;"
        "and %5,%10;"
        "add %11,%3;"
        "or %12,%10;"
        "add %10,%3;"
        "movdqa %%xmm6,%%xmm4;"
        "movdqa %%xmm7,%%xmm5;"
        "sub $0x1,%1;"
        "jne Lloop2_%=;"
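        // Add the working variables back into the state and advance to the
        // next 64-byte block.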
        "add (%0),%3;"
        "mov %3,(%0);"
        "add 0x4(%0),%4;"
        "mov %4,0x4(%0);"
        "add 0x8(%0),%5;"
        "mov %5,0x8(%0);"
        "add 0xc(%0),%6;"
        "mov %6,0xc(%0);"
        "add 0x10(%0),%k2;"
        "mov %k2,0x10(%0);"
        "add 0x14(%0),%7;"
        "mov %7,0x14(%0);"
        "add 0x18(%0),%8;"
        "mov %8,0x18(%0);"
        "add 0x1c(%0),%9;"
        "mov %9,0x1c(%0);"
        "mov %15,%1;"
        "add $0x40,%1;"
        "cmp %14,%1;"
        "jne Lloop0_%=;"

        "Ldone_hash_%=:"

        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = blocks (%k2) */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
    );
}
}
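
/*
  A minimal usage sketch (not part of this file; `HashOneBlock` is a
  hypothetical helper, and the actual runtime selection of this Transform in
  sha256.cpp may differ): compress one already-padded 64-byte block into the
  standard SHA-256 initial state.

    #include <cstdint>
    #include <cstring>

    namespace sha256_sse4 { void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks); }

    void HashOneBlock(const unsigned char block[64], uint32_t state_out[8])
    {
        // SHA-256 initial hash values (FIPS 180-4).
        uint32_t s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
        sha256_sse4::Transform(s, block, 1); // one 64-byte block
        std::memcpy(state_out, s, sizeof(s));
    }
*/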

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to https://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm
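; For example, "addm [4*0 + CTX], a" expands to:
;     add a, [4*0 + CTX]
;     mov [4*0 + CTX], a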

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
    MOVDQ %1, %2
    pshufb %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12

%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d



_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
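; With the sizes above this gives _INP_END = 0, _INP = 8, _XFER = 16 and
; _XMM_SAVE = 32, so STACK_SIZE is 32 on Linux and 144 (32 + 7*16) on Windows.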

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
    ;; compute s0 four at a time and s1 two at a time
    ;; compute W[-16] + W[-7] 4 at a time
    movdqa XTMP0, X3
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    movdqa XTMP1, X1
    xor y1, a ; y1 = a ^ (a >> (22-13)
    xor y2, g ; y2 = f^g
    paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    ;; compute s0
    palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
    movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pslld XTMP1, (32-7)
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    psrld XTMP2, 7
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
    mov y0, e ; y0 = e
    mov y1, a ; y1 = a
    movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
    ror y0, (25-11) ; y0 = e >> (25-11)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y1, (22-13) ; y1 = a >> (22-13)
    pslld XTMP3, (32-18)
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g ; y2 = f^g
    psrld XTMP2, 18
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    pxor XTMP1, XTMP3
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pxor XTMP1, XTMP4 ; XTMP1 = s0
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    ;; compute low s1
    pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
    mov y0, e ; y0 = e
    mov y1, a ; y1 = a
    ror y0, (25-11) ; y0 = e >> (25-11)
    movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
    xor y0, e ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f ; y2 = f
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
    xor y2, g ; y2 = f^g
    psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    pxor XTMP2, XTMP3
    add y2, y0 ; y2 = S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
    pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    ;; compute high s1
    pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
    xor y1, a ; y1 = a ^ (a >> (22-13)
    xor y2, g ; y2 = f^g
    psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    pxor XTMP2, XTMP3
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
    pxor X0, XTMP2 ; X0 = s1 {xDxC}
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    xor y0, e ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f ; y2 = f
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g ; y2 = f^g
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    and y2, e ; y2 = (f^g)&e
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    add y2, y0 ; y2 = S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
    ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_sse4
align 32
sha256_sse4:
    push rbx
%ifndef LINUX
    push rsi
    push rdi
%endif
    push rbp
    push r13
    push r14
    push r15

    sub rsp,STACK_SIZE
%ifndef LINUX
    movdqa [rsp + _XMM_SAVE + 0*16],xmm6
    movdqa [rsp + _XMM_SAVE + 1*16],xmm7
    movdqa [rsp + _XMM_SAVE + 2*16],xmm8
    movdqa [rsp + _XMM_SAVE + 3*16],xmm9
    movdqa [rsp + _XMM_SAVE + 4*16],xmm10
    movdqa [rsp + _XMM_SAVE + 5*16],xmm11
    movdqa [rsp + _XMM_SAVE + 6*16],xmm12
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz done_hash
    add NUM_BLKS, INP ; pointer to end of data
    mov [rsp + _INP_END], NUM_BLKS

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
    movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
    lea TBL,[K256 wrt rip]

    ;; byte swap first 16 dwords
    COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
    mov SRND, 3
align 16
loop1:
    movdqa XFER, [TBL + 0*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 1*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 2*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 3*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    add TBL, 4*16
    FOUR_ROUNDS_AND_SCHED

    sub SRND, 1
    jne loop1

    mov SRND, 2
loop2:
    paddd X0, [TBL + 0*16]
    movdqa [rsp + _XFER], X0
    DO_ROUND 0
    DO_ROUND 1
    DO_ROUND 2
    DO_ROUND 3
    paddd X1, [TBL + 1*16]
    movdqa [rsp + _XFER], X1
    add TBL, 2*16
    DO_ROUND 0
    DO_ROUND 1
    DO_ROUND 2
    DO_ROUND 3

    movdqa X0, X2
    movdqa X1, X3

    sub SRND, 1
    jne loop2

    addm [4*0 + CTX],a
    addm [4*1 + CTX],b
    addm [4*2 + CTX],c
    addm [4*3 + CTX],d
    addm [4*4 + CTX],e
    addm [4*5 + CTX],f
    addm [4*6 + CTX],g
    addm [4*7 + CTX],h

    mov INP, [rsp + _INP]
    add INP, 64
    cmp INP, [rsp + _INP_END]
    jne loop0

done_hash:
%ifndef LINUX
    movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
    movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
    movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
    movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
    movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
    movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
    movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
%endif

    add rsp, STACK_SIZE

    pop r15
    pop r14
    pop r13
    pop rbp
%ifndef LINUX
    pop rdi
    pop rsi
%endif
    pop rbx

    ret


section .data
align 64
K256:
    dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
*/

#endif