1 files changed, 29 insertions, 18 deletions
diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl
index 263ac02b6f45..04833a0c876d 100755
--- a/crypto/bn/asm/vis3-mont.pl
+++ b/crypto/bn/asm/vis3-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -9,7 +16,7 @@
 
 # October 2012.
 #
-# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
 # onward. There are three new instructions used here: umulxhi,
 # addxc[cc] and initializing store. On T3 RSA private key operations
 # are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
@@ -18,16 +25,20 @@
 # for reference purposes, because T4 has dedicated Montgomery
 # multiplication and squaring *instructions* that deliver even more.
 
-$bits=32;
-for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)  { $bias=2047; $frame=192; }
-else            { $bias=0;    $frame=112; }
+$output = pop;
+open STDOUT,">$output";
+
+$frame = "STACK_FRAME";
+$bias = "STACK_BIAS";
+
+$code.=<<___;
+#include "sparc_arch.h"
 
-$code.=<<___ if ($bits==64);
+#ifdef	__arch64__
 .register	%g2,#scratch
 .register	%g3,#scratch
-___
-$code.=<<___;
+#endif
+
 .section	".text",#alloc,#execinstr
 ___
 
@@ -299,23 +310,23 @@ $code.=<<___;
 	sub	$anp,	$num,	$anp
 	sub	$rp,	$num,	$rp
 
-	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
-	and	$tp,	$ovf,	$ap
-	andn	$rp,	$ovf,	$np
-	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
+	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
 	ba	.Lcopy
 	sub	$num,	8,	$cnt
 
 .align	16
-.Lcopy:					! copy or in-place refresh
-	ld	[$ap+0],	$t2
-	ld	[$ap+4],	$t3
-	add	$ap,	8,	$ap
+.Lcopy:					! conditional copy
+	ld	[$tp+0],	$t0
+	ld	[$tp+4],	$t1
+	ld	[$rp+0],	$t2
+	ld	[$rp+4],	$t3
 	stx	%g0,	[$tp]		! zap
 	add	$tp,	8,	$tp
 	stx	%g0,	[$anp]		! zap
 	stx	%g0,	[$anp+8]
 	add	$anp,	16,	$anp
+	movcs	%icc,	$t0,	$t2
+	movcs	%icc,	$t1,	$t3
 	st	$t3,	[$rp+0]		! flip order
 	st	$t2,	[$rp+4]
 	add	$rp,	8,	$rp
@@ -333,7 +344,7 @@ ___
 
 # Purpose of these subroutines is to explicitly encode VIS instructions,
 # so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 # Idea is to reserve for option to produce "universal" binary and let
 # programmer detect if current CPU is VIS capable at run-time.
 sub unvis3 {