1 file changed, 39 insertions, 15 deletions
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 595fc6d31f60..c41b620bc23e 100755
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -28,7 +35,7 @@
 # key lengths. As it's obviously inappropriate as "best all-round"
 # alternative, it has to be complemented with run-time CPU family
 # detection. Oh! It should also be noted that unlike other PowerPC
-# implementation IALU ppc-mont.pl module performs *suboptimaly* on
+# implementation IALU ppc-mont.pl module performs *suboptimally* on
 # >=1024-bit key lengths on Power 6. It should also be noted that
 # *everything* said so far applies to 64-bit builds! As far as 32-bit
 # application executed on 64-bit CPU goes, this module is likely to
@@ -1346,7 +1353,7 @@ $code.=<<___;
 	std	$t3,-16($tp)		; tp[j-1]
 	std	$t5,-8($tp)		; tp[j]
 
-	add	$carry,$carry,$ovf	; comsume upmost overflow
+	add	$carry,$carry,$ovf	; consume upmost overflow
 	add	$t6,$t6,$carry		; can not overflow
 	srdi	$carry,$t6,16
 	add	$t7,$t7,$carry
@@ -1494,16 +1501,14 @@ Lsub:	ldx	$t0,$tp,$i
 
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
-	and	$ap,$tp,$ovf
-	andc	$np,$rp,$ovf
-	or	$ap,$ap,$np	; ap=borrow?tp:rp
-	addi	$t7,$ap,8
 	mtctr	$j
 
 .align	4
-Lcopy:				; copy or in-place refresh
-	ldx	$t0,$ap,$i
-	ldx	$t1,$t7,$i
+Lcopy:				; conditional copy
+	ldx	$t0,$tp,$i
+	ldx	$t1,$t4,$i
+	ldx	$t2,$rp,$i
+	ldx	$t3,$t6,$i
 	std	$i,8($nap_d)	; zap nap_d
 	std	$i,16($nap_d)
 	std	$i,24($nap_d)
@@ -1512,6 +1517,12 @@ Lcopy:				; copy or in-place refresh
 	std	$i,48($nap_d)
 	std	$i,56($nap_d)
 	stdu	$i,64($nap_d)
+	and	$t0,$t0,$ovf
+	and	$t1,$t1,$ovf
+	andc	$t2,$t2,$ovf
+	andc	$t3,$t3,$ovf
+	or	$t0,$t0,$t2
+	or	$t1,$t1,$t3
 	stdx	$t0,$rp,$i
 	stdx	$t1,$t6,$i
 	stdx	$i,$tp,$i	; zap tp at once
@@ -1554,20 +1565,21 @@ Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
 
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
-	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
+	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
 	subf	$rp,$num,$rp	; rewind rp
-	and	$ap,$tp,$ovf
-	andc	$np,$rp,$ovf
-	or	$ap,$ap,$np	; ap=borrow?tp:rp
 	addi	$tp,$sp,`$FRAME+$TRANSFER`
 	mtctr	$j
 
 .align	4
-Lcopy:				; copy or in-place refresh
+Lcopy:				; conditional copy
 	lwz	$t0,4($ap)
 	lwz	$t1,8($ap)
 	lwz	$t2,12($ap)
 	lwzu	$t3,16($ap)
+	lwz	$t4,4($rp)
+	lwz	$t5,8($rp)
+	lwz	$t6,12($rp)
+	lwz	$t7,16($rp)
 	std	$i,8($nap_d)	; zap nap_d
 	std	$i,16($nap_d)
 	std	$i,24($nap_d)
@@ -1576,6 +1588,18 @@ Lcopy:				; copy or in-place refresh
 	std	$i,48($nap_d)
 	std	$i,56($nap_d)
 	stdu	$i,64($nap_d)
+	and	$t0,$t0,$ovf
+	and	$t1,$t1,$ovf
+	and	$t2,$t2,$ovf
+	and	$t3,$t3,$ovf
+	andc	$t4,$t4,$ovf
+	andc	$t5,$t5,$ovf
+	andc	$t6,$t6,$ovf
+	andc	$t7,$t7,$ovf
+	or	$t0,$t0,$t4
+	or	$t1,$t1,$t5
+	or	$t2,$t2,$t6
+	or	$t3,$t3,$t7
 	stw	$t0,4($rp)
 	stw	$t1,8($rp)
 	stw	$t2,12($rp)