Diffstat (limited to 'system/openssl/CVE-2019-1551.patch')
-rw-r--r--  system/openssl/CVE-2019-1551.patch  757
1 file changed, 757 insertions(+), 0 deletions(-)
diff --git a/system/openssl/CVE-2019-1551.patch b/system/openssl/CVE-2019-1551.patch
new file mode 100644
index 000000000..8daf04ebf
--- /dev/null
+++ b/system/openssl/CVE-2019-1551.patch
@@ -0,0 +1,757 @@
+From 419102400a2811582a7a3d4a4e317d72e5ce0a8f Mon Sep 17 00:00:00 2001
+From: Andy Polyakov <appro@openssl.org>
+Date: Wed, 4 Dec 2019 12:48:21 +0100
+Subject: [PATCH] Fix an overflow bug in rsaz_512_sqr
+
+There is an overflow bug in the x86_64 Montgomery squaring procedure used in
+exponentiation with 512-bit moduli. No EC algorithms are affected. Analysis
+suggests that attacks against 2-prime RSA1024, 3-prime RSA1536, and DSA1024 as a
+result of this defect would be very difficult to perform and are not believed
+likely. Attacks against DH512 are considered just feasible. However, for an
+attack the target would have to re-use the DH512 private key, which is not
+recommended anyway. Also applications directly using the low level API
+BN_mod_exp may be affected if they use BN_FLG_CONSTTIME.
+
+CVE-2019-1551
+
+Reviewed-by: Paul Dale <paul.dale@oracle.com>
+Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+(Merged from https://github.com/openssl/openssl/pull/10575)
+---
+ crypto/bn/asm/rsaz-x86_64.pl | 381 ++++++++++++++++++-----------------
+ 1 file changed, 197 insertions(+), 184 deletions(-)
+
+diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
+index b1797b649f..7534d5cd03 100755
+--- a/crypto/bn/asm/rsaz-x86_64.pl
++++ b/crypto/bn/asm/rsaz-x86_64.pl
+@@ -116,7 +116,7 @@ rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
+ subq \$128+24, %rsp
+ .cfi_adjust_cfa_offset 128+24
+ .Lsqr_body:
+- movq $mod, %rbp # common argument
++ movq $mod, %xmm1 # common off-load
+ movq ($inp), %rdx
+ movq 8($inp), %rax
+ movq $n0, 128(%rsp)
+@@ -134,7 +134,8 @@ $code.=<<___;
+ .Loop_sqr:
+ movl $times,128+8(%rsp)
+ #first iteration
+- movq %rdx, %rbx
++ movq %rdx, %rbx # 0($inp)
++ mov %rax, %rbp # 8($inp)
+ mulq %rdx
+ movq %rax, %r8
+ movq 16($inp), %rax
+@@ -173,31 +174,29 @@ $code.=<<___;
+ mulq %rbx
+ addq %rax, %r14
+ movq %rbx, %rax
+- movq %rdx, %r15
+- adcq \$0, %r15
++ adcq \$0, %rdx
+
+- addq %r8, %r8 #shlq \$1, %r8
+- movq %r9, %rcx
+- adcq %r9, %r9 #shld \$1, %r8, %r9
++ xorq %rcx,%rcx # rcx:r8 = r8 << 1
++ addq %r8, %r8
++ movq %rdx, %r15
++ adcq \$0, %rcx
+
+ mulq %rax
+- movq %rax, (%rsp)
+- addq %rdx, %r8
+- adcq \$0, %r9
++ addq %r8, %rdx
++ adcq \$0, %rcx
+
+- movq %r8, 8(%rsp)
+- shrq \$63, %rcx
++ movq %rax, (%rsp)
++ movq %rdx, 8(%rsp)
+
+ #second iteration
+- movq 8($inp), %r8
+ movq 16($inp), %rax
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r10
+ movq 24($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r11
+ movq 32($inp), %rax
+ adcq \$0, %rdx
+@@ -205,7 +204,7 @@ $code.=<<___;
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r12
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+@@ -213,7 +212,7 @@ $code.=<<___;
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r13
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+@@ -221,7 +220,7 @@ $code.=<<___;
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r14
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+@@ -229,39 +228,39 @@ $code.=<<___;
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+- mulq %r8
++ mulq %rbp
+ addq %rax, %r15
+- movq %r8, %rax
++ movq %rbp, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+- movq %rdx, %r8
+- movq %r10, %rdx
+- adcq \$0, %r8
++ adcq \$0, %rdx
+
+- add %rdx, %rdx
+- lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+- movq %r11, %rbx
+- adcq %r11, %r11 #shld \$1, %r10, %r11
++ xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
++ addq %r9, %r9
++ movq %rdx, %r8
++ adcq %r10, %r10
++ adcq \$0, %rbx
+
+ mulq %rax
++ addq %rcx, %rax
++ movq 16($inp), %rbp
++ adcq \$0, %rdx
+ addq %rax, %r9
++ movq 24($inp), %rax
+ adcq %rdx, %r10
+- adcq \$0, %r11
++ adcq \$0, %rbx
+
+ movq %r9, 16(%rsp)
+ movq %r10, 24(%rsp)
+- shrq \$63, %rbx
+
+ #third iteration
+- movq 16($inp), %r9
+- movq 24($inp), %rax
+- mulq %r9
++ mulq %rbp
+ addq %rax, %r12
+ movq 32($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+- mulq %r9
++ mulq %rbp
+ addq %rax, %r13
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+@@ -269,7 +268,7 @@ $code.=<<___;
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+- mulq %r9
++ mulq %rbp
+ addq %rax, %r14
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+@@ -277,9 +276,7 @@ $code.=<<___;
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+- mulq %r9
+- movq %r12, %r10
+- lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
++ mulq %rbp
+ addq %rax, %r15
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+@@ -287,36 +284,40 @@ $code.=<<___;
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+- mulq %r9
+- shrq \$63, %r10
++ mulq %rbp
+ addq %rax, %r8
+- movq %r9, %rax
++ movq %rbp, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r8
+- movq %rdx, %r9
+- adcq \$0, %r9
++ adcq \$0, %rdx
+
+- movq %r13, %rcx
+- leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
++ xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
++ addq %r11, %r11
++ movq %rdx, %r9
++ adcq %r12, %r12
++ adcq \$0, %rcx
+
+ mulq %rax
++ addq %rbx, %rax
++ movq 24($inp), %r10
++ adcq \$0, %rdx
+ addq %rax, %r11
++ movq 32($inp), %rax
+ adcq %rdx, %r12
+- adcq \$0, %r13
++ adcq \$0, %rcx
+
+ movq %r11, 32(%rsp)
+ movq %r12, 40(%rsp)
+- shrq \$63, %rcx
+
+ #fourth iteration
+- movq 24($inp), %r10
+- movq 32($inp), %rax
++ mov %rax, %r11 # 32($inp)
+ mulq %r10
+ addq %rax, %r14
+ movq 40($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
++ mov %rax, %r12 # 40($inp)
+ mulq %r10
+ addq %rax, %r15
+ movq 48($inp), %rax
+@@ -325,9 +326,8 @@ $code.=<<___;
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
++ mov %rax, %rbp # 48($inp)
+ mulq %r10
+- movq %r14, %r12
+- leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
+ addq %rax, %r8
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+@@ -336,32 +336,33 @@ $code.=<<___;
+ adcq \$0, %rbx
+
+ mulq %r10
+- shrq \$63, %r12
+ addq %rax, %r9
+ movq %r10, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r9
+- movq %rdx, %r10
+- adcq \$0, %r10
++ adcq \$0, %rdx
+
+- movq %r15, %rbx
+- leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
++ xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
++ addq %r13, %r13
++ movq %rdx, %r10
++ adcq %r14, %r14
++ adcq \$0, %rbx
+
+ mulq %rax
++ addq %rcx, %rax
++ adcq \$0, %rdx
+ addq %rax, %r13
++ movq %r12, %rax # 40($inp)
+ adcq %rdx, %r14
+- adcq \$0, %r15
++ adcq \$0, %rbx
+
+ movq %r13, 48(%rsp)
+ movq %r14, 56(%rsp)
+- shrq \$63, %rbx
+
+ #fifth iteration
+- movq 32($inp), %r11
+- movq 40($inp), %rax
+ mulq %r11
+ addq %rax, %r8
+- movq 48($inp), %rax
++ movq %rbp, %rax # 48($inp)
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+@@ -369,97 +370,99 @@ $code.=<<___;
+ addq %rax, %r9
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+- movq %r8, %r12
+- leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
+ addq %rcx, %r9
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
++ mov %rax, %r14 # 56($inp)
+ mulq %r11
+- shrq \$63, %r12
+ addq %rax, %r10
+ movq %r11, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r10
+- movq %rdx, %r11
+- adcq \$0, %r11
++ adcq \$0, %rdx
+
+- movq %r9, %rcx
+- leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
++ xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
++ addq %r15, %r15
++ movq %rdx, %r11
++ adcq %r8, %r8
++ adcq \$0, %rcx
+
+ mulq %rax
++ addq %rbx, %rax
++ adcq \$0, %rdx
+ addq %rax, %r15
++ movq %rbp, %rax # 48($inp)
+ adcq %rdx, %r8
+- adcq \$0, %r9
++ adcq \$0, %rcx
+
+ movq %r15, 64(%rsp)
+ movq %r8, 72(%rsp)
+- shrq \$63, %rcx
+
+ #sixth iteration
+- movq 40($inp), %r12
+- movq 48($inp), %rax
+ mulq %r12
+ addq %rax, %r10
+- movq 56($inp), %rax
++ movq %r14, %rax # 56($inp)
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r12
+ addq %rax, %r11
+ movq %r12, %rax
+- movq %r10, %r15
+- leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ adcq \$0, %rdx
+- shrq \$63, %r15
+ addq %rbx, %r11
+- movq %rdx, %r12
+- adcq \$0, %r12
++ adcq \$0, %rdx
+
+- movq %r11, %rbx
+- leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
++ xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
++ addq %r9, %r9
++ movq %rdx, %r12
++ adcq %r10, %r10
++ adcq \$0, %rbx
+
+ mulq %rax
++ addq %rcx, %rax
++ adcq \$0, %rdx
+ addq %rax, %r9
++ movq %r14, %rax # 56($inp)
+ adcq %rdx, %r10
+- adcq \$0, %r11
++ adcq \$0, %rbx
+
+ movq %r9, 80(%rsp)
+ movq %r10, 88(%rsp)
+
+ #seventh iteration
+- movq 48($inp), %r13
+- movq 56($inp), %rax
+- mulq %r13
++ mulq %rbp
+ addq %rax, %r12
+- movq %r13, %rax
+- movq %rdx, %r13
+- adcq \$0, %r13
++ movq %rbp, %rax
++ adcq \$0, %rdx
+
+- xorq %r14, %r14
+- shlq \$1, %rbx
+- adcq %r12, %r12 #shld \$1, %rbx, %r12
+- adcq %r13, %r13 #shld \$1, %r12, %r13
+- adcq %r14, %r14 #shld \$1, %r13, %r14
++ xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
++ addq %r11, %r11
++ movq %rdx, %r13
++ adcq %r12, %r12
++ adcq \$0, %rcx
+
+ mulq %rax
++ addq %rbx, %rax
++ adcq \$0, %rdx
+ addq %rax, %r11
++ movq %r14, %rax # 56($inp)
+ adcq %rdx, %r12
+- adcq \$0, %r13
++ adcq \$0, %rcx
+
+ movq %r11, 96(%rsp)
+ movq %r12, 104(%rsp)
+
+ #eighth iteration
+- movq 56($inp), %rax
++ xorq %rbx, %rbx # rbx:r13 = r13 << 1
++ addq %r13, %r13
++ adcq \$0, %rbx
++
+ mulq %rax
+- addq %rax, %r13
++ addq %rcx, %rax
+ adcq \$0, %rdx
+-
+- addq %rdx, %r14
+-
+- movq %r13, 112(%rsp)
+- movq %r14, 120(%rsp)
++ addq %r13, %rax
++ adcq %rbx, %rdx
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+@@ -469,6 +472,10 @@ $code.=<<___;
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
++ movq %xmm1, %rbp
++
++ movq %rax, 112(%rsp)
++ movq %rdx, 120(%rsp)
+
+ call __rsaz_512_reduce
+
+@@ -500,9 +507,9 @@ $code.=<<___;
+ .Loop_sqrx:
+ movl $times,128+8(%rsp)
+ movq $out, %xmm0 # off-load
+- movq %rbp, %xmm1 # off-load
+ #first iteration
+ mulx %rax, %r8, %r9
++ mov %rax, %rbx
+
+ mulx 16($inp), %rcx, %r10
+ xor %rbp, %rbp # cf=0, of=0
+@@ -510,40 +517,39 @@ $code.=<<___;
+ mulx 24($inp), %rax, %r11
+ adcx %rcx, %r9
+
+- mulx 32($inp), %rcx, %r12
++ .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
+ adcx %rax, %r10
+
+- mulx 40($inp), %rax, %r13
++ .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
+ adcx %rcx, %r11
+
+- .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
++ mulx 48($inp), %rcx, %r14
+ adcx %rax, %r12
+ adcx %rcx, %r13
+
+- .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
++ mulx 56($inp), %rax, %r15
+ adcx %rax, %r14
+ adcx %rbp, %r15 # %rbp is 0
+
+- mov %r9, %rcx
+- shld \$1, %r8, %r9
+- shl \$1, %r8
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
+- adcx %rdx, %r8
+- mov 8($inp), %rdx
+- adcx %rbp, %r9
++ mulx %rdx, %rax, $out
++ mov %rbx, %rdx # 8($inp)
++ xor %rcx, %rcx
++ adox %r8, %r8
++ adcx $out, %r8
++ adox %rbp, %rcx
++ adcx %rbp, %rcx
+
+ mov %rax, (%rsp)
+ mov %r8, 8(%rsp)
+
+ #second iteration
+- mulx 16($inp), %rax, %rbx
++ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+- .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
++ mulx 24($inp), $out, %r8
+ adox $out, %r11
++ .byte 0x66
+ adcx %r8, %r12
+
+ mulx 32($inp), %rax, %rbx
+@@ -561,24 +567,25 @@ $code.=<<___;
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
+ adox $out, %r15
+ adcx %rbp, %r8
++ mulx %rdx, %rax, $out
+ adox %rbp, %r8
++ .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
+
+- mov %r11, %rbx
+- shld \$1, %r10, %r11
+- shld \$1, %rcx, %r10
+-
+- xor %ebp,%ebp
+- mulx %rdx, %rax, %rcx
+- mov 16($inp), %rdx
++ xor %rbx, %rbx
++ adcx %rcx, %rax
++ adox %r9, %r9
++ adcx %rbp, $out
++ adox %r10, %r10
+ adcx %rax, %r9
+- adcx %rcx, %r10
+- adcx %rbp, %r11
++ adox %rbp, %rbx
++ adcx $out, %r10
++ adcx %rbp, %rbx
+
+ mov %r9, 16(%rsp)
+ .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
+
+ #third iteration
+- .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
++ mulx 24($inp), $out, %r9
+ adox $out, %r12
+ adcx %r9, %r13
+
+@@ -586,7 +593,7 @@ $code.=<<___;
+ adox %rax, %r13
+ adcx %rcx, %r14
+
+- mulx 40($inp), $out, %r9
++ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
+ adox $out, %r14
+ adcx %r9, %r15
+
+@@ -594,27 +601,28 @@ $code.=<<___;
+ adox %rax, %r15
+ adcx %rcx, %r8
+
+- .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
++ mulx 56($inp), $out, %r9
+ adox $out, %r8
+ adcx %rbp, %r9
++ mulx %rdx, %rax, $out
+ adox %rbp, %r9
++ mov 24($inp), %rdx
+
+- mov %r13, %rcx
+- shld \$1, %r12, %r13
+- shld \$1, %rbx, %r12
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
++ xor %rcx, %rcx
++ adcx %rbx, %rax
++ adox %r11, %r11
++ adcx %rbp, $out
++ adox %r12, %r12
+ adcx %rax, %r11
+- adcx %rdx, %r12
+- mov 24($inp), %rdx
+- adcx %rbp, %r13
++ adox %rbp, %rcx
++ adcx $out, %r12
++ adcx %rbp, %rcx
+
+ mov %r11, 32(%rsp)
+- .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
++ mov %r12, 40(%rsp)
+
+ #fourth iteration
+- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
++ mulx 32($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+@@ -629,25 +637,25 @@ $code.=<<___;
+ mulx 56($inp), $out, %r10
+ adox $out, %r9
+ adcx %rbp, %r10
++ mulx %rdx, %rax, $out
+ adox %rbp, %r10
++ mov 32($inp), %rdx
+
+- .byte 0x66
+- mov %r15, %rbx
+- shld \$1, %r14, %r15
+- shld \$1, %rcx, %r14
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
++ xor %rbx, %rbx
++ adcx %rcx, %rax
++ adox %r13, %r13
++ adcx %rbp, $out
++ adox %r14, %r14
+ adcx %rax, %r13
+- adcx %rdx, %r14
+- mov 32($inp), %rdx
+- adcx %rbp, %r15
++ adox %rbp, %rbx
++ adcx $out, %r14
++ adcx %rbp, %rbx
+
+ mov %r13, 48(%rsp)
+ mov %r14, 56(%rsp)
+
+ #fifth iteration
+- .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
++ mulx 40($inp), $out, %r11
+ adox $out, %r8
+ adcx %r11, %r9
+
+@@ -658,18 +666,19 @@ $code.=<<___;
+ mulx 56($inp), $out, %r11
+ adox $out, %r10
+ adcx %rbp, %r11
++ mulx %rdx, %rax, $out
++ mov 40($inp), %rdx
+ adox %rbp, %r11
+
+- mov %r9, %rcx
+- shld \$1, %r8, %r9
+- shld \$1, %rbx, %r8
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
++ xor %rcx, %rcx
++ adcx %rbx, %rax
++ adox %r15, %r15
++ adcx %rbp, $out
++ adox %r8, %r8
+ adcx %rax, %r15
+- adcx %rdx, %r8
+- mov 40($inp), %rdx
+- adcx %rbp, %r9
++ adox %rbp, %rcx
++ adcx $out, %r8
++ adcx %rbp, %rcx
+
+ mov %r15, 64(%rsp)
+ mov %r8, 72(%rsp)
+@@ -682,18 +691,19 @@ $code.=<<___;
+ .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
+ adox $out, %r11
+ adcx %rbp, %r12
++ mulx %rdx, %rax, $out
+ adox %rbp, %r12
++ mov 48($inp), %rdx
+
+- mov %r11, %rbx
+- shld \$1, %r10, %r11
+- shld \$1, %rcx, %r10
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
++ xor %rbx, %rbx
++ adcx %rcx, %rax
++ adox %r9, %r9
++ adcx %rbp, $out
++ adox %r10, %r10
+ adcx %rax, %r9
+- adcx %rdx, %r10
+- mov 48($inp), %rdx
+- adcx %rbp, %r11
++ adcx $out, %r10
++ adox %rbp, %rbx
++ adcx %rbp, %rbx
+
+ mov %r9, 80(%rsp)
+ mov %r10, 88(%rsp)
+@@ -703,31 +713,31 @@ $code.=<<___;
+ adox %rax, %r12
+ adox %rbp, %r13
+
+- xor %r14, %r14
+- shld \$1, %r13, %r14
+- shld \$1, %r12, %r13
+- shld \$1, %rbx, %r12
+-
+- xor %ebp, %ebp
+- mulx %rdx, %rax, %rdx
+- adcx %rax, %r11
+- adcx %rdx, %r12
++ mulx %rdx, %rax, $out
++ xor %rcx, %rcx
+ mov 56($inp), %rdx
+- adcx %rbp, %r13
++ adcx %rbx, %rax
++ adox %r11, %r11
++ adcx %rbp, $out
++ adox %r12, %r12
++ adcx %rax, %r11
++ adox %rbp, %rcx
++ adcx $out, %r12
++ adcx %rbp, %rcx
+
+ .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
+
+ #eighth iteration
+ mulx %rdx, %rax, %rdx
+- adox %rax, %r13
+- adox %rbp, %rdx
++ xor %rbx, %rbx
++ adcx %rcx, %rax
++ adox %r13, %r13
++ adcx %rbp, %rdx
++ adox %rbp, %rbx
++ adcx %r13, %rax
++ adcx %rdx, %rbx
+
+- .byte 0x66
+- add %rdx, %r14
+-
+- movq %r13, 112(%rsp)
+- movq %r14, 120(%rsp)
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+@@ -741,6 +751,9 @@ $code.=<<___;
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
++ movq %rax, 112(%rsp)
++ movq %rbx, 120(%rsp)
++
+ call __rsaz_512_reducex
+
+ addq 64(%rsp), %r8
+--
+2.17.1
+
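
Note (not part of the upstream patch): the advisory text above says the bug is
reachable through the low-level BN_mod_exp API when BN_FLG_CONSTTIME is set and
the modulus is 512 bits; on x86_64 builds with RSAZ enabled, that combination
can take the rsaz_512_sqr path this patch rewrites. A minimal sketch of such a
call, assuming libcrypto is available and using an arbitrary random 512-bit odd
modulus (illustrative only, not a real DH512 group):

    /* Sketch: exercises the path the advisory describes --
     * BN_mod_exp with BN_FLG_CONSTTIME and a 512-bit modulus.
     * Error handling abbreviated. */
    #include <openssl/bn.h>
    #include <stdio.h>

    int main(void)
    {
        BN_CTX *ctx = BN_CTX_new();
        BIGNUM *r = BN_new(), *a = BN_new(), *p = BN_new(), *m = BN_new();

        /* Arbitrary 512-bit odd modulus and 512-bit base/exponent. */
        BN_rand(m, 512, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD);
        BN_rand(a, 512, BN_RAND_TOP_ANY, BN_RAND_BOTTOM_ANY);
        BN_rand(p, 512, BN_RAND_TOP_ANY, BN_RAND_BOTTOM_ANY);

        /* BN_FLG_CONSTTIME on the exponent selects the constant-time
         * Montgomery implementation. */
        BN_set_flags(p, BN_FLG_CONSTTIME);

        if (BN_mod_exp(r, a, p, m, ctx))
            printf("result has %d bits\n", BN_num_bits(r));

        BN_free(r); BN_free(a); BN_free(p); BN_free(m);
        BN_CTX_free(ctx);
        return 0;
    }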
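
As for the fix itself: the patch replaces the old shrq \$63 / shld \$1 / lea
doubling of the cross products with explicit addq/adcq chains that capture the
carry out of each doubling in a zeroed spare register (the "rcx:r8 = r8 << 1"
style comments in the new code), so the carry is folded into the next squared
limb instead of being recomputed from shifted copies. A rough C analogue of
that doubling-with-carry pattern; the register names in the comments are only
an illustration of the mapping, not taken from any one hunk:

    /* Illustration: double the two-limb value hi:lo in place while
     * preserving the carry out -- the pattern the patched assembly
     * uses instead of the old shrq/shld sequences. */
    #include <stdint.h>
    #include <stdio.h>

    /* Returns the bit shifted out of hi. */
    static uint64_t dbl2(uint64_t *lo, uint64_t *hi)
    {
        uint64_t into_hi   = *lo >> 63;     /* bit moving into hi   */
        uint64_t carry_out = *hi >> 63;     /* bit leaving the pair */
        *lo = *lo << 1;                     /* addq %r9, %r9        */
        *hi = (*hi << 1) | into_hi;         /* adcq %r10, %r10      */
        return carry_out;                   /* adcq $0, %rbx        */
    }

    int main(void)
    {
        uint64_t lo = 0x8000000000000001ULL, hi = 0xC000000000000002ULL;
        uint64_t c  = dbl2(&lo, &hi);
        printf("carry=%llu hi=%016llx lo=%016llx\n",
               (unsigned long long)c, (unsigned long long)hi,
               (unsigned long long)lo);
        return 0;
    }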