From a43ce912fc025d11e1395506111f75fc194d7ba5 Mon Sep 17 00:00:00 2001 From: Jung-uk Kim Date: Thu, 13 Sep 2018 19:18:07 +0000 Subject: Import OpenSSL 1.1.1. --- crypto/bn/asm/ia64.S | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'crypto/bn/asm/ia64.S') diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S index a9a42abfc302..d235c45e2d63 100644 --- a/crypto/bn/asm/ia64.S +++ b/crypto/bn/asm/ia64.S @@ -1,11 +1,18 @@ .explicit .text .ident "ia64.S, Version 2.1" -.ident "IA-64 ISA artwork by Andy Polyakov " +.ident "IA-64 ISA artwork by Andy Polyakov " + +// Copyright 2001-2018 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html // // ==================================================================== -// Written by Andy Polyakov for the OpenSSL +// Written by Andy Polyakov for the OpenSSL // project. // // Rights for redistribution and usage in source and binary forms are @@ -13,7 +20,7 @@ // disclaimed. // ==================================================================== // -// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is +// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is // different from Itanium to this module viewpoint. Most notably, is it // "wider" than Itanium? Can you experience loop scalability as // discussed in commentary sections? Not really:-( Itanium2 has 6 @@ -22,7 +29,7 @@ // ports is the same, i.e. 2, while I need 4. In other words, to this // module Itanium2 remains effectively as "wide" as Itanium. Yet it's // essentially different in respect to this module, and a re-tune was -// required. Well, because some intruction latencies has changed. Most +// required. Well, because some instruction latencies has changed. Most // noticeably those intensively used: // // Itanium Itanium2 @@ -134,7 +141,7 @@ // User Mask I want to excuse the kernel from preserving upper // (f32-f128) FP register bank over process context switch, thus // minimizing bus bandwidth consumption during the switch (i.e. -// after PKI opration completes and the program is off doing +// after PKI operation completes and the program is off doing // something else like bulk symmetric encryption). Having said // this, I also want to point out that it might be good idea // to compile the whole toolkit (as well as majority of the @@ -150,12 +157,15 @@ #else #define ADDP add #endif +#ifdef __VMS +.alias abort, "decc$abort" +#endif #if 1 // // bn_[add|sub]_words routines. // -// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the +// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the // data reside in L1 cache, i.e. 2 ticks away). It's possible to // compress the epilogue and get down to 2*n+6, but at the cost of // scalability (the neat feature of this implementation is that it @@ -363,7 +373,7 @@ bn_mul_words: // The loop therefore spins at the latency of xma minus 1, or in other // words at 6*(n+4) ticks:-( Compare to the "production" loop above // that runs in 2*(n+11) where the low latency problem is worked around -// by moving the dependency to one-tick latent interger ALU. Note that +// by moving the dependency to one-tick latent integer ALU. Note that // "distance" between ldf8 and xma is not latency of ldf8, but the // *difference* between xma and ldf8 latencies. .L_bn_mul_words_ctop: @@ -425,7 +435,7 @@ bn_mul_add_words: // version was performing *all* additions in IALU and was starving // for those even on Itanium 2. In this version one addition is // moved to FPU and is folded with multiplication. This is at cost -// of propogating the result from previous call to this subroutine +// of propagating the result from previous call to this subroutine // to L2 cache... In other words negligible even for shorter keys. // *Overall* performance improvement [over previous version] varies // from 11 to 22 percent depending on key length. @@ -493,9 +503,9 @@ bn_sqr_words: // possible to compress the epilogue (I'm getting tired to write this // comment over and over) and get down to 2*n+16 at the cost of // scalability. The decision will very likely be reconsidered after the -// benchmark program is profiled. I.e. if perfomance gain on Itanium +// benchmark program is profiled. I.e. if performance gain on Itanium // will appear larger than loss on "wider" IA-64, then the loop should -// be explicitely split and the epilogue compressed. +// be explicitly split and the epilogue compressed. .L_bn_sqr_words_ctop: { .mfi; (p16) ldf8 f32=[r33],8 (p25) xmpy.lu f42=f41,f41 @@ -929,7 +939,7 @@ bn_mul_comba8: xma.hu f118=f39,f127,f117 } { .mfi; xma.lu f117=f39,f127,f117 };;// //-------------------------------------------------// -// Leaving muliplier's heaven... Quite a ride, huh? +// Leaving multiplier's heaven... Quite a ride, huh? { .mii; getf.sig r31=f47 add r25=r25,r24 @@ -1421,6 +1431,7 @@ bn_div_words: mov ar.ec=0 // don't rotate at exit mov pr.rot=0 } { .mii; mov L=r33 // save l + mov r25=r0 // needed if abort is called on VMS mov r36=r0 };; .L_divw_shift: // -vv- note signed comparison @@ -1522,9 +1533,8 @@ bn_div_words: // output: f8 = (int)(a/b) // clobbered: f8,f9,f10,f11,pred pred=p15 -// One can argue that this snippet is copyrighted to Intel -// Corporation, as it's essentially identical to one of those -// found in "Divide, Square Root and Remainder" section at +// This snippet is based on text found in the "Divide, Square +// Root and Remainder" section at // http://www.intel.com/software/products/opensource/libraries/num.htm. // Yes, I admit that the referred code was used as template, // but after I realized that there hardly is any other instruction -- cgit v1.2.3