20 files changed, 1981 insertions, 421 deletions
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 5c3e08fa8085..0491e3db4c47 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -67,16 +67,22 @@ bn86-elf.s:	asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
 co86-elf.s:	asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
+mo86-elf.s:	asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@)
 # COFF
 bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
 co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
+mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@)
 # a.out
 bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
 co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
+mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@)
 
 sparcv8.o:	asm/sparcv8.S
 	$(CC) $(CFLAGS) -c asm/sparcv8.S
@@ -91,6 +97,8 @@ bn-mips3.o:	asm/mips3.s
 
 x86_64-gcc.o:	asm/x86_64-gcc.c
 	$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
+x86_64-mont.s:	asm/x86_64-mont.pl
+	$(PERL) asm/x86_64-mont.pl $@
 
 bn-ia64.s:	asm/ia64.S
 	$(CC) $(CFLAGS) -E asm/ia64.S > $@
@@ -108,6 +116,7 @@ linux_ppc64.s: asm/ppc.pl;	$(PERL) $< $@
 aix_ppc32.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
 aix_ppc64.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
 osx_ppc32.s: asm/ppc.pl;	$(PERL) $< $@
+osx_ppc64.s: asm/ppc.pl;	$(PERL) $< $@
 
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 7b82b820e62a..951abc53ea5b 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -171,21 +171,21 @@
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
 					}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr		};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
@@ -224,21 +224,21 @@ bn_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_sub_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
 					}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr		};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
@@ -283,7 +283,6 @@ bn_sub_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_mul_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #ifdef XMA_TEMPTATION
 { .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
@@ -294,9 +293,10 @@ bn_mul_words:
 	cmp4.le		p6,p0=r34,r0
 (p6)	br.ret.spnt.many	b0		};;
 
-	.save	ar.lc,r3
 { .mii;	sub	r10=r34,r0,1
+	.save	ar.lc,r3
 	mov	r3=ar.lc
+	.save	pr,r9
 	mov	r9=pr			};;
 
 	.body
@@ -397,22 +397,21 @@ bn_mul_words:
 .skip	48	// makes the loop body aligned at 64-byte boundary
 bn_mul_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	ar.lc,r3
-	.save	pr,r9
 { .mmi;	alloc		r2=ar.pfs,4,4,0,8
 	cmp4.le		p6,p0=r34,r0
+	.save	ar.lc,r3
 	mov		r3=ar.lc	};;
 { .mib;	mov		r8=r0		// return value
 	sub		r10=r34,r0,1
 (p6)	br.ret.spnt.many	b0	};;
 
-	.body
 { .mib;	setf.sig	f8=r35		// w
+	.save	pr,r9
 	mov		r9=pr
 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
 					}
+	.body
 { .mmi;	ADDP		r14=0,r32	// rp
 	ADDP		r15=0,r33	// ap
 	mov		ar.lc=r10	}
@@ -466,7 +465,6 @@ bn_mul_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary 
 bn_sqr_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,3,0,0,0
 	sxt4		r34=r34		};;
@@ -476,9 +474,10 @@ bn_sqr_words:
 	nop.f		0x0
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mii;	sub	r10=r34,r0,1
+	.save	ar.lc,r3
 	mov	r3=ar.lc
+	.save	pr,r9
 	mov	r9=pr			};;
 
 	.body
@@ -545,7 +544,6 @@ bn_sqr_words:
 .align	64
 bn_sqr_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
@@ -617,7 +615,6 @@ bn_sqr_comba8:
 .align	64
 bn_mul_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
@@ -1175,7 +1172,6 @@ bn_mul_comba8:
 .align	64
 bn_sqr_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc   r2=ar.pfs,2,1,0,0
@@ -1208,7 +1204,6 @@ bn_sqr_comba4:
 .align	64
 bn_mul_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc   r2=ar.pfs,3,0,0,0
@@ -1411,11 +1406,11 @@ equ=p24
 .align	64
 bn_div_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	b0,r3
 { .mii;	alloc		r2=ar.pfs,3,5,0,8
+	.save	b0,r3
 	mov		r3=b0
+	.save	pr,r10
 	mov		r10=pr		};;
 { .mmb;	cmp.eq		p6,p0=r34,r0
 	mov		r8=-1
diff --git a/crypto/bn/asm/mo-586.pl b/crypto/bn/asm/mo-586.pl
new file mode 100755
index 000000000000..0982293094d9
--- /dev/null
+++ b/crypto/bn/asm/mo-586.pl
@@ -0,0 +1,603 @@
+#!/usr/bin/env perl
+
+# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
+# from OpenSSL 0.9.9-dev 
+
+sub ::asciz
+{ my @str=unpack("C*",shift);
+    push @str,0;
+    while ($#str>15) {
+	&data_byte(@str[0..15]);
+	foreach (0..15) { shift @str; }
+    }
+    &data_byte(@str) if (@str);
+}
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arraning 2K window between stack
+	# pointer and ap argument [np is also position sensitive vector,
+	# but it's assumed to be near ap, as it's allocated at ~same
+	# time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengthes on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&lea	("edx",&DWP(0,$sbit,"edx",2));
+	&shr	($carry,31);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
+
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
new file mode 100755
index 000000000000..c43b69592a5c
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,214 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives modest
+# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
+# than twice, >2x, as fast. Most common rsa1024 sign is improved by
+# respectful 50%. It remains to be seen if loop unrolling and
+# dedicated squaring routine can provide further improvement...
+
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
+
+# int bn_mul_mont(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num);
+$lo0="%r10";
+$hi0="%r11";
+$bp="%r12";	# reassign $bp
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function,6
+.align	16
+bn_mul_mont:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	2($num),%rax
+	mov	%rsp,%rbp
+	neg	%rax
+	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
+	mov	%rdx,$bp		# $bp reassigned, remember?
+
+	mov	($n0),$n0		# pull n0[0] value
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# "tp[0]"*n0
+	mov	%rax,$m1
+
+	mulq	($np)			# np[0]*m1
+	add	$lo0,%rax		# discarded
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.L1st:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[0]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	($np,$j,8),%rax
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	cmp	$num,$j
+	mov	%rdx,$hi1
+	jl	.L1st
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter:
+	xor	$j,$j			# j=0
+
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	mov	($ap),%rax		# ap[0]
+	mulq	$m0			# ap[0]*bp[i]
+	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# tp[0]*n0
+	mov	%rax,$m1
+
+	mulq	($np,$j,8)		# np[0]*m1
+	add	$lo0,%rax		# discarded
+	mov	8(%rsp),$lo0		# tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.align	4
+.Linner:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[i]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	(%rsp,$j,8),$lo0
+	cmp	$num,$j
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	jl	.Linner
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jl	.Louter
+
+	lea	(%rsp),$ap		# borrow ap for tp
+	lea	-1($num),$j		# j=num-1
+
+	mov	($ap),%rax		# tp[0]
+	xor	$i,$i			# i=0 and clear CF!
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	dec	$j			# doesn't affect CF!
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	jge	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$j,8),%rax
+	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
+	mov	$i,(%rsp,$j,8)		# zap temporary vector
+	dec	$j
+	jge	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsp	# restore %rsp
+	mov	\$1,%rax
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 95c5d643cbd1..6d754d554776 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -245,8 +245,18 @@ extern "C" {
 
 #define BN_FLG_MALLOCED		0x01
 #define BN_FLG_STATIC_DATA	0x02
-#define BN_FLG_EXP_CONSTTIME	0x04 /* avoid leaking exponent information through timings
-                            	      * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
+#define BN_FLG_CONSTTIME	0x04 /* avoid leaking exponent information through timing,
+                                      * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
+                                      * BN_div() will call BN_div_no_branch,
+                                      * BN_mod_inverse() will call BN_mod_inverse_no_branch.
+                                      */
+
+#ifndef OPENSSL_NO_DEPRECATED
+#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */
+                                      /* avoid leaking exponent information through timings
+                                      * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
+#endif
+
 #ifndef OPENSSL_NO_DEPRECATED
 #define BN_FLG_FREE		0x8000	/* used for debuging */
 #endif
@@ -293,7 +303,12 @@ struct bn_mont_ctx_st
 	BIGNUM N;      /* The modulus */
 	BIGNUM Ni;     /* R*(1/R mod N) - N*Ni = 1
 	                * (Ni is only stored for bignum algorithm) */
+#if 0
+	/* OpenSSL 0.9.9 preview: */
+	BN_ULONG n0[2];/* least significant word(s) of Ni */
+#else
 	BN_ULONG n0;   /* least significant word of Ni */
+#endif
 	int flags;
 	};
 
@@ -534,7 +549,7 @@ BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
 #define	BN_BLINDING_NO_UPDATE	0x00000001
 #define	BN_BLINDING_NO_RECREATE	0x00000002
 
-BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, /* const */ BIGNUM *mod);
 void BN_BLINDING_free(BN_BLINDING *b);
 int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx);
 int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
@@ -546,7 +561,7 @@ void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
 unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
 void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
 BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
-	const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+	const BIGNUM *e, /* const */ BIGNUM *m, BN_CTX *ctx,
 	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 			  const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
 	BN_MONT_CTX *m_ctx);
@@ -775,6 +790,7 @@ void ERR_load_BN_strings(void);
 #define BN_F_BN_CTX_NEW					 106
 #define BN_F_BN_CTX_START				 129
 #define BN_F_BN_DIV					 107
+#define BN_F_BN_DIV_NO_BRANCH				 138
 #define BN_F_BN_DIV_RECP				 130
 #define BN_F_BN_EXP					 123
 #define BN_F_BN_EXPAND2					 108
@@ -793,6 +809,7 @@ void ERR_load_BN_strings(void);
 #define BN_F_BN_MOD_EXP_RECP				 125
 #define BN_F_BN_MOD_EXP_SIMPLE				 126
 #define BN_F_BN_MOD_INVERSE				 110
+#define BN_F_BN_MOD_INVERSE_NO_BRANCH			 139
 #define BN_F_BN_MOD_LSHIFT_QUICK			 119
 #define BN_F_BN_MOD_MUL_RECIPROCAL			 111
 #define BN_F_BN_MOD_SQRT				 121
diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c
index ca22d4f8bdcb..c11fb4ccc2d3 100644
--- a/crypto/bn/bn_blind.c
+++ b/crypto/bn/bn_blind.c
@@ -131,7 +131,7 @@ struct bn_blinding_st
 			  BN_MONT_CTX *m_ctx);
 	};
 
-BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, /* const */ BIGNUM *mod)
 	{
 	BN_BLINDING *ret=NULL;
 
@@ -151,7 +151,12 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
 		{
 		if ((ret->Ai = BN_dup(Ai)) == NULL) goto err;
 		}
-	ret->mod = mod;
+
+	/* save a copy of mod in the BN_BLINDING structure */
+	if ((ret->mod = BN_dup(mod)) == NULL) goto err;
+	if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
+		BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
+
 	ret->counter = BN_BLINDING_COUNTER;
 	return(ret);
 err:
@@ -167,6 +172,7 @@ void BN_BLINDING_free(BN_BLINDING *r)
 	if (r->A  != NULL) BN_free(r->A );
 	if (r->Ai != NULL) BN_free(r->Ai);
 	if (r->e  != NULL) BN_free(r->e );
+	if (r->mod != NULL) BN_free(r->mod); 
 	OPENSSL_free(r);
 	}
 
@@ -278,7 +284,7 @@ void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
 	}
 
 BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
-	const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+	const BIGNUM *e, /* const */ BIGNUM *m, BN_CTX *ctx,
 	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 			  const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
 	BN_MONT_CTX *m_ctx)
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 2857f44861a7..1e8e57626be3 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -169,13 +169,15 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
 #endif /* OPENSSL_NO_ASM */
 
 
-/* BN_div computes  dv := num / divisor,  rounding towards zero, and sets up
- * rm  such that  dv*divisor + rm = num  holds.
+/* BN_div[_no_branch] computes  dv := num / divisor,  rounding towards
+ * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
  * Thus:
  *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
  *     rm->neg == num->neg                 (unless the remainder is zero)
  * If 'dv' or 'rm' is NULL, the respective value is not returned.
  */
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
+        const BIGNUM *divisor, BN_CTX *ctx);
 int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	   BN_CTX *ctx)
 	{
@@ -185,9 +187,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	BN_ULONG d0,d1;
 	int num_n,div_n;
 
+	/* Invalid zero-padding would have particularly bad consequences
+	 * in the case of 'num', so don't just rely on bn_check_top() for this one
+	 * (bn_check_top() works only for BN_DEBUG builds) */
+	if (num->top > 0 && num->d[num->top - 1] == 0)
+		{
+		BNerr(BN_F_BN_DIV,BN_R_NOT_INITIALIZED);
+		return 0;
+		}
+
+	bn_check_top(num);
+
+	if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
+		{
+		return BN_div_no_branch(dv, rm, num, divisor, ctx);
+		}
+
 	bn_check_top(dv);
 	bn_check_top(rm);
-	bn_check_top(num);
+	/* bn_check_top(num); */ /* 'num' has been checked already */
 	bn_check_top(divisor);
 
 	if (BN_is_zero(divisor))
@@ -397,4 +415,229 @@ err:
 	return(0);
 	}
 
+
+/* BN_div_no_branch is a special version of BN_div. It does not contain
+ * branches that may leak sensitive information.
+ */
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, 
+	const BIGNUM *divisor, BN_CTX *ctx)
+	{
+	int norm_shift,i,loop;
+	BIGNUM *tmp,wnum,*snum,*sdiv,*res;
+	BN_ULONG *resp,*wnump;
+	BN_ULONG d0,d1;
+	int num_n,div_n;
+
+	bn_check_top(dv);
+	bn_check_top(rm);
+	/* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
+	bn_check_top(divisor);
+
+	if (BN_is_zero(divisor))
+		{
+		BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
+		return(0);
+		}
+
+	BN_CTX_start(ctx);
+	tmp=BN_CTX_get(ctx);
+	snum=BN_CTX_get(ctx);
+	sdiv=BN_CTX_get(ctx);
+	if (dv == NULL)
+		res=BN_CTX_get(ctx);
+	else	res=dv;
+	if (sdiv == NULL || res == NULL) goto err;
+
+	/* First we normalise the numbers */
+	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
+	if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
+	sdiv->neg=0;
+	norm_shift+=BN_BITS2;
+	if (!(BN_lshift(snum,num,norm_shift))) goto err;
+	snum->neg=0;
+
+	/* Since we don't know whether snum is larger than sdiv,
+	 * we pad snum with enough zeroes without changing its
+	 * value. 
+	 */
+	if (snum->top <= sdiv->top+1) 
+		{
+		if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+		for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+		snum->top = sdiv->top + 2;
+		}
+	else
+		{
+		if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+		snum->d[snum->top] = 0;
+		snum->top ++;
+		}
+
+	div_n=sdiv->top;
+	num_n=snum->top;
+	loop=num_n-div_n;
+	/* Lets setup a 'window' into snum
+	 * This is the part that corresponds to the current
+	 * 'area' being divided */
+	wnum.neg   = 0;
+	wnum.d     = &(snum->d[loop]);
+	wnum.top   = div_n;
+	/* only needed when BN_ucmp messes up the values between top and max */
+	wnum.dmax  = snum->dmax - loop; /* so we don't step out of bounds */
+
+	/* Get the top 2 words of sdiv */
+	/* div_n=sdiv->top; */
+	d0=sdiv->d[div_n-1];
+	d1=(div_n == 1)?0:sdiv->d[div_n-2];
+
+	/* pointer to the 'top' of snum */
+	wnump= &(snum->d[num_n-1]);
+
+	/* Setup to 'res' */
+	res->neg= (num->neg^divisor->neg);
+	if (!bn_wexpand(res,(loop+1))) goto err;
+	res->top=loop-1;
+	resp= &(res->d[loop-1]);
+
+	/* space for temp */
+	if (!bn_wexpand(tmp,(div_n+1))) goto err;
+
+	/* if res->top == 0 then clear the neg value otherwise decrease
+	 * the resp pointer */
+	if (res->top == 0)
+		res->neg = 0;
+	else
+		resp--;
+
+	for (i=0; i<loop-1; i++, wnump--, resp--)
+		{
+		BN_ULONG q,l0;
+		/* the first part of the loop uses the top two words of
+		 * snum and sdiv to calculate a BN_ULONG q such that
+		 * | wnum - sdiv * q | < sdiv */
+#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
+		BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
+		q=bn_div_3_words(wnump,d1,d0);
+#else
+		BN_ULONG n0,n1,rem=0;
+
+		n0=wnump[0];
+		n1=wnump[-1];
+		if (n0 == d0)
+			q=BN_MASK2;
+		else 			/* n0 < d0 */
+			{
+#ifdef BN_LLONG
+			BN_ULLONG t2;
+
+#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
+			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
+#else
+			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			/*
+			 * rem doesn't have to be BN_ULLONG. The least we
+			 * know it's less that d0, isn't it?
+			 */
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
+			t2=(BN_ULLONG)d1*q;
+
+			for (;;)
+				{
+				if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				t2 -= d1;
+				}
+#else /* !BN_LLONG */
+			BN_ULONG t2l,t2h,ql,qh;
+
+			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
+
+#if defined(BN_UMULT_LOHI)
+			BN_UMULT_LOHI(t2l,t2h,d1,q);
+#elif defined(BN_UMULT_HIGH)
+			t2l = d1 * q;
+			t2h = BN_UMULT_HIGH(d1,q);
+#else
+			t2l=LBITS(d1); t2h=HBITS(d1);
+			ql =LBITS(q);  qh =HBITS(q);
+			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+#endif
+
+			for (;;)
+				{
+				if ((t2h < rem) ||
+					((t2h == rem) && (t2l <= wnump[-2])))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				if (t2l < d1) t2h--; t2l -= d1;
+				}
+#endif /* !BN_LLONG */
+			}
+#endif /* !BN_DIV3W */
+
+		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
+		tmp->d[div_n]=l0;
+		wnum.d--;
+		/* ingore top values of the bignums just sub the two 
+		 * BN_ULONG arrays with bn_sub_words */
+		if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+			{
+			/* Note: As we have considered only the leading
+			 * two BN_ULONGs in the calculation of q, sdiv * q
+			 * might be greater than wnum (but then (q-1) * sdiv
+			 * is less or equal than wnum)
+			 */
+			q--;
+			if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
+				/* we can't have an overflow here (assuming
+				 * that q != 0, but if q == 0 then tmp is
+				 * zero anyway) */
+				(*wnump)++;
+			}
+		/* store part of the result */
+		*resp = q;
+		}
+	bn_correct_top(snum);
+	if (rm != NULL)
+		{
+		/* Keep a copy of the neg flag in num because if rm==num
+		 * BN_rshift() will overwrite it.
+		 */
+		int neg = num->neg;
+		BN_rshift(rm,snum,norm_shift);
+		if (!BN_is_zero(rm))
+			rm->neg = neg;
+		bn_check_top(rm);
+		}
+	bn_correct_top(res);
+	BN_CTX_end(ctx);
+	return(1);
+err:
+	bn_check_top(rm);
+	BN_CTX_end(ctx);
+	return(0);
+	}
+
 #endif
diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c
index 24fbbb772d0f..cfe2eb94a0ce 100644
--- a/crypto/bn/bn_err.c
+++ b/crypto/bn/bn_err.c
@@ -1,6 +1,6 @@
 /* crypto/bn/bn_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2005 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -82,6 +82,7 @@ static ERR_STRING_DATA BN_str_functs[]=
 {ERR_FUNC(BN_F_BN_CTX_NEW),	"BN_CTX_new"},
 {ERR_FUNC(BN_F_BN_CTX_START),	"BN_CTX_start"},
 {ERR_FUNC(BN_F_BN_DIV),	"BN_div"},
+{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH),	"BN_div_no_branch"},
 {ERR_FUNC(BN_F_BN_DIV_RECP),	"BN_div_recp"},
 {ERR_FUNC(BN_F_BN_EXP),	"BN_exp"},
 {ERR_FUNC(BN_F_BN_EXPAND2),	"bn_expand2"},
@@ -100,6 +101,7 @@ static ERR_STRING_DATA BN_str_functs[]=
 {ERR_FUNC(BN_F_BN_MOD_EXP_RECP),	"BN_mod_exp_recp"},
 {ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE),	"BN_mod_exp_simple"},
 {ERR_FUNC(BN_F_BN_MOD_INVERSE),	"BN_mod_inverse"},
+{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH),	"BN_mod_inverse_no_branch"},
 {ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK),	"BN_mod_lshift_quick"},
 {ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL),	"BN_mod_mul_reciprocal"},
 {ERR_FUNC(BN_F_BN_MOD_SQRT),	"BN_mod_sqrt"},
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 8f8c69448191..70a33f0d936c 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -122,9 +122,9 @@ int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	int i,bits,ret=0;
 	BIGNUM *v,*rr;
 
-	if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0)
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 		{
-		/* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 		BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 		return -1;
 		}
@@ -213,7 +213,7 @@ int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
 	if (BN_is_odd(m))
 		{
 #  ifdef MONT_EXP_WORD
-		if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) == 0))
+		if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0))
 			{
 			BN_ULONG A = a->d[0];
 			ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
@@ -245,9 +245,9 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	BIGNUM *val[TABLE_SIZE];
 	BN_RECP_CTX recp;
 
-	if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0)
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 		{
-		/* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 		BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 		return -1;
 		}
@@ -379,7 +379,7 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 	BIGNUM *val[TABLE_SIZE];
 	BN_MONT_CTX *mont=NULL;
 
-	if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0)
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 		{
 		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
 		}
@@ -745,9 +745,9 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
 		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
 
-	if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0)
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 		{
-		/* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 		return -1;
 		}
@@ -881,9 +881,9 @@ int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	/* Table of variables obtained from 'ctx' */
 	BIGNUM *val[TABLE_SIZE];
 
-	if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0)
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 		{
-		/* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 		BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 		return -1;
 		}
diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c
index f02e6fcdb422..4a352119ba8a 100644
--- a/crypto/bn/bn_gcd.c
+++ b/crypto/bn/bn_gcd.c
@@ -203,6 +203,8 @@ err:
 
 
 /* solves ax == 1 (mod n) */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+        const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
 BIGNUM *BN_mod_inverse(BIGNUM *in,
 	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
 	{
@@ -210,6 +212,11 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
 	BIGNUM *ret=NULL;
 	int sign;
 
+	if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0))
+		{
+		return BN_mod_inverse_no_branch(in, a, n, ctx);
+		}
+
 	bn_check_top(a);
 	bn_check_top(n);
 
@@ -491,3 +498,157 @@ err:
 	bn_check_top(ret);
 	return(ret);
 	}
+
+
+/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse. 
+ * It does not contain branches that may leak sensitive information.
+ */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
+	{
+	BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
+	BIGNUM local_A, local_B;
+	BIGNUM *pA, *pB;
+	BIGNUM *ret=NULL;
+	int sign;
+
+	bn_check_top(a);
+	bn_check_top(n);
+
+	BN_CTX_start(ctx);
+	A = BN_CTX_get(ctx);
+	B = BN_CTX_get(ctx);
+	X = BN_CTX_get(ctx);
+	D = BN_CTX_get(ctx);
+	M = BN_CTX_get(ctx);
+	Y = BN_CTX_get(ctx);
+	T = BN_CTX_get(ctx);
+	if (T == NULL) goto err;
+
+	if (in == NULL)
+		R=BN_new();
+	else
+		R=in;
+	if (R == NULL) goto err;
+
+	BN_one(X);
+	BN_zero(Y);
+	if (BN_copy(B,a) == NULL) goto err;
+	if (BN_copy(A,n) == NULL) goto err;
+	A->neg = 0;
+
+	if (B->neg || (BN_ucmp(B, A) >= 0))
+		{
+		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+	 	 * BN_div_no_branch will be called eventually.
+	 	 */
+		pB = &local_B;
+		BN_with_flags(pB, B, BN_FLG_CONSTTIME);	
+		if (!BN_nnmod(B, pB, A, ctx)) goto err;
+		}
+	sign = -1;
+	/* From  B = a mod |n|,  A = |n|  it follows that
+	 *
+	 *      0 <= B < A,
+	 *     -sign*X*a  ==  B   (mod |n|),
+	 *      sign*Y*a  ==  A   (mod |n|).
+	 */
+
+	while (!BN_is_zero(B))
+		{
+		BIGNUM *tmp;
+		
+		/*
+		 *      0 < B < A,
+		 * (*) -sign*X*a  ==  B   (mod |n|),
+		 *      sign*Y*a  ==  A   (mod |n|)
+		 */
+
+		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+	 	 * BN_div_no_branch will be called eventually.
+	 	 */
+		pA = &local_A;
+		BN_with_flags(pA, A, BN_FLG_CONSTTIME);	
+		
+		/* (D, M) := (A/B, A%B) ... */		
+		if (!BN_div(D,M,pA,B,ctx)) goto err;
+		
+		/* Now
+		 *      A = D*B + M;
+		 * thus we have
+		 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+		 */
+		
+		tmp=A; /* keep the BIGNUM object, the value does not matter */
+		
+		/* (A, B) := (B, A mod B) ... */
+		A=B;
+		B=M;
+		/* ... so we have  0 <= B < A  again */
+		
+		/* Since the former  M  is now  B  and the former  B  is now  A,
+		 * (**) translates into
+		 *       sign*Y*a  ==  D*A + B    (mod |n|),
+		 * i.e.
+		 *       sign*Y*a - D*A  ==  B    (mod |n|).
+		 * Similarly, (*) translates into
+		 *      -sign*X*a  ==  A          (mod |n|).
+		 *
+		 * Thus,
+		 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+		 * i.e.
+		 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+		 *
+		 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+		 *      -sign*X*a  ==  B   (mod |n|),
+		 *       sign*Y*a  ==  A   (mod |n|).
+		 * Note that  X  and  Y  stay non-negative all the time.
+		 */
+			
+		if (!BN_mul(tmp,D,X,ctx)) goto err;
+		if (!BN_add(tmp,tmp,Y)) goto err;
+
+		M=Y; /* keep the BIGNUM object, the value does not matter */
+		Y=X;
+		X=tmp;
+		sign = -sign;
+		}
+		
+	/*
+	 * The while loop (Euclid's algorithm) ends when
+	 *      A == gcd(a,n);
+	 * we have
+	 *       sign*Y*a  ==  A  (mod |n|),
+	 * where  Y  is non-negative.
+	 */
+
+	if (sign < 0)
+		{
+		if (!BN_sub(Y,n,Y)) goto err;
+		}
+	/* Now  Y*a  ==  A  (mod |n|).  */
+
+	if (BN_is_one(A))
+		{
+		/* Y*a == 1  (mod |n|) */
+		if (!Y->neg && BN_ucmp(Y,n) < 0)
+			{
+			if (!BN_copy(R,Y)) goto err;
+			}
+		else
+			{
+			if (!BN_nnmod(R,Y,n,ctx)) goto err;
+			}
+		}
+	else
+		{
+		BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH,BN_R_NO_INVERSE);
+		goto err;
+		}
+	ret=R;
+err:
+	if ((ret == NULL) && (in == NULL)) BN_free(R);
+	BN_CTX_end(ctx);
+	bn_check_top(ret);
+	return(ret);
+	}
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 6a793857e130..306f029f2789 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -384,7 +384,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
 		if (zz == 0) break;
 		d1 = BN_BITS2 - d0;
 		
-		if (d0) z[dN] = (z[dN] << d1) >> d1; /* clear up the top d1 bits */
+		/* clear up the top d1 bits */
+		if (d0)
+			z[dN] = (z[dN] << d1) >> d1;
+		else
+			z[dN] = 0;
 		z[0] ^= zz; /* reduction t^0 component */
 
 		for (k = 1; p[k] != 0; k++)
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index ad4ca7ff305a..27ac4397a151 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -481,6 +481,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 	int cl, int dl);
 BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 	int cl, int dl);
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 
 #ifdef  __cplusplus
 }
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 210ccb42bba1..2649b8c53851 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -763,7 +763,7 @@ int BN_is_bit_set(const BIGNUM *a, int n)
 	i=n/BN_BITS2;
 	j=n%BN_BITS2;
 	if (a->top <= i) return 0;
-	return((a->d[i]&(((BN_ULONG)1)<<j))?1:0);
+	return(((a->d[i])>>j)&((BN_ULONG)1));
 	}
 
 int BN_mask_bits(BIGNUM *a, int n)
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 961ca67ea1f5..4799b152ddcb 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -122,11 +122,50 @@
 
 #define MONT_WORD /* use the faster word-based algorithm */
 
+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+/* This condition means we have a specific non-default build:
+ * In the 0.9.8 branch, OPENSSL_BN_ASM_MONT is normally not set for any
+ * BN_BITS2<=32 platform; an explicit "enable-montasm" is required.
+ * I.e., if we are here, the user intentionally deviates from the
+ * normal stable build to get better Montgomery performance from
+ * the 0.9.9-dev backport.
+ *
+ * In this case only, we also enable BN_from_montgomery_word()
+ * (another non-stable feature from 0.9.9-dev).
+ */
+#define MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD
+#endif
+
+#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
+
+
 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 			  BN_MONT_CTX *mont, BN_CTX *ctx)
 	{
 	BIGNUM *tmp;
 	int ret=0;
+#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
+	int num = mont->N.top;
+
+	if (num>1 && a->top==num && b->top==num)
+		{
+		if (bn_wexpand(r,num) == NULL) return(0);
+#if 0 /* for OpenSSL 0.9.9 mont->n0 */
+		if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
+#else
+		if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num))
+#endif
+			{
+			r->neg = a->neg^b->neg;
+			r->top = num;
+			bn_correct_top(r);
+			return(1);
+			}
+		}
+#endif
 
 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);
@@ -142,7 +181,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 		if (!BN_mul(tmp,a,b,ctx)) goto err;
 		}
 	/* reduce from aRR to aR */
+#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD
+	if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
+#else
 	if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
+#endif
 	bn_check_top(r);
 	ret=1;
 err:
@@ -150,6 +193,150 @@ err:
 	return(ret);
 	}
 
+#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+	{
+	BIGNUM *n;
+	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
+	int al,nl,max,i,x,ri;
+
+	n= &(mont->N);
+	/* mont->ri is the size of mont->N in bits (rounded up
+	   to the word size) */
+	al=ri=mont->ri/BN_BITS2;
+
+	nl=n->top;
+	if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+
+	max=(nl+al+1); /* allow for overflow (no?) XXX */
+	if (bn_wexpand(r,max) == NULL) return(0);
+
+	r->neg^=n->neg;
+	np=n->d;
+	rp=r->d;
+	nrp= &(r->d[nl]);
+
+	/* clear the top words of T */
+	for (i=r->top; i<max; i++) /* memset? XXX */
+		r->d[i]=0;
+
+	r->top=max;
+#if 0 /* for OpenSSL 0.9.9 mont->n0 */
+	n0=mont->n0[0];
+#else
+	n0=mont->n0;
+#endif
+
+#ifdef BN_COUNT
+	fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
+#endif
+	for (i=0; i<nl; i++)
+		{
+#ifdef __TANDEM
+                {
+                   long long t1;
+                   long long t2;
+                   long long t3;
+                   t1 = rp[0] * (n0 & 0177777);
+                   t2 = 037777600000l;
+                   t2 = n0 & t2;
+                   t3 = rp[0] & 0177777;
+                   t2 = (t3 * t2) & BN_MASK2;
+                   t1 = t1 + t2;
+                   v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
+                }
+#else
+		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
+#endif
+		nrp++;
+		rp++;
+		if (((nrp[-1]+=v)&BN_MASK2) >= v)
+			continue;
+		else
+			{
+			if (((++nrp[0])&BN_MASK2) != 0) continue;
+			if (((++nrp[1])&BN_MASK2) != 0) continue;
+			for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
+			}
+		}
+	bn_correct_top(r);
+
+	/* mont->ri will be a multiple of the word size and below code
+	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+	if (r->top <= ri)
+		{
+		ret->top=0;
+		return(1);
+		}
+	al=r->top-ri;
+
+	if (bn_wexpand(ret,ri) == NULL) return(0);
+	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
+	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	ret->neg=r->neg;
+
+	rp=ret->d;
+	ap=&(r->d[ri]);
+
+	{
+	size_t m1,m2;
+
+	v=bn_sub_words(rp,ap,np,ri);
+	/* this ----------------^^ works even in al<ri case
+	 * thanks to zealous zeroing of top of the vector in the
+	 * beginning. */
+
+	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+	/* in other words if subtraction result is real, then
+	 * trick unconditional memcpy below to perform in-place
+	 * "refresh" instead of actual copy. */
+	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
+	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
+	m1|=m2;			/* (al!=ri) */
+	m1|=(0-(size_t)v);	/* (al!=ri || v) */
+	m1&=~m2;		/* (al!=ri || v) && !al>ri */
+	nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+	}
+
+	/* 'i<ri' is chosen to eliminate dependency on input data, even
+	 * though it results in redundant copy in al<ri case. */
+	for (i=0,ri-=4; i<ri; i+=4)
+		{
+		BN_ULONG t1,t2,t3,t4;
+		
+		t1=nrp[i+0];
+		t2=nrp[i+1];
+		t3=nrp[i+2];	ap[i+0]=0;
+		t4=nrp[i+3];	ap[i+1]=0;
+		rp[i+0]=t1;	ap[i+2]=0;
+		rp[i+1]=t2;	ap[i+3]=0;
+		rp[i+2]=t3;
+		rp[i+3]=t4;
+		}
+	for (ri+=4; i<ri; i++)
+		rp[i]=nrp[i], ap[i]=0;
+	bn_correct_top(r);
+	bn_correct_top(ret);
+	bn_check_top(ret);
+
+	return(1);
+	}
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+	     BN_CTX *ctx)
+	{
+	int retn=0;
+	BIGNUM *t;
+
+	BN_CTX_start(ctx);
+	if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
+		retn = BN_from_montgomery_word(ret,t,mont);
+	BN_CTX_end(ctx);
+	return retn;
+	}
+
+#else /* !MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD */
+
 int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	     BN_CTX *ctx)
 	{
@@ -176,7 +363,6 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 
 	max=(nl+al+1); /* allow for overflow (no?) XXX */
 	if (bn_wexpand(r,max) == NULL) goto err;
-	if (bn_wexpand(ret,max) == NULL) goto err;
 
 	r->neg=a->neg^n->neg;
 	np=n->d;
@@ -228,19 +414,72 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 		}
 	bn_correct_top(r);
 	
-	/* mont->ri will be a multiple of the word size */
-#if 0
-	BN_rshift(ret,r,mont->ri);
-#else
-	ret->neg = r->neg;
-	x=ri;
+	/* mont->ri will be a multiple of the word size and below code
+	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+	if (r->top <= ri)
+		{
+		ret->top=0;
+		retn=1;
+		goto err;
+		}
+	al=r->top-ri;
+
+# define BRANCH_FREE 1
+# if BRANCH_FREE
+	if (bn_wexpand(ret,ri) == NULL) goto err;
+	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
+	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	ret->neg=r->neg;
+
 	rp=ret->d;
-	ap= &(r->d[x]);
-	if (r->top < x)
-		al=0;
-	else
-		al=r->top-x;
+	ap=&(r->d[ri]);
+
+	{
+	size_t m1,m2;
+
+	v=bn_sub_words(rp,ap,np,ri);
+	/* this ----------------^^ works even in al<ri case
+	 * thanks to zealous zeroing of top of the vector in the
+	 * beginning. */
+
+	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+	/* in other words if subtraction result is real, then
+	 * trick unconditional memcpy below to perform in-place
+	 * "refresh" instead of actual copy. */
+	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
+	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
+	m1|=m2;			/* (al!=ri) */
+	m1|=(0-(size_t)v);	/* (al!=ri || v) */
+	m1&=~m2;		/* (al!=ri || v) && !al>ri */
+	nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+	}
+
+	/* 'i<ri' is chosen to eliminate dependency on input data, even
+	 * though it results in redundant copy in al<ri case. */
+	for (i=0,ri-=4; i<ri; i+=4)
+		{
+		BN_ULONG t1,t2,t3,t4;
+		
+		t1=nrp[i+0];
+		t2=nrp[i+1];
+		t3=nrp[i+2];	ap[i+0]=0;
+		t4=nrp[i+3];	ap[i+1]=0;
+		rp[i+0]=t1;	ap[i+2]=0;
+		rp[i+1]=t2;	ap[i+3]=0;
+		rp[i+2]=t3;
+		rp[i+3]=t4;
+		}
+	for (ri+=4; i<ri; i++)
+		rp[i]=nrp[i], ap[i]=0;
+	bn_correct_top(r);
+	bn_correct_top(ret);
+# else
+	if (bn_wexpand(ret,al) == NULL) goto err;
 	ret->top=al;
+	ret->neg=r->neg;
+
+	rp=ret->d;
+	ap=&(r->d[ri]);
 	al-=4;
 	for (i=0; i<al; i+=4)
 		{
@@ -258,7 +497,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	al+=4;
 	for (; i<al; i++)
 		rp[i]=ap[i];
-#endif
+# endif
 #else /* !MONT_WORD */ 
 	BIGNUM *t1,*t2;
 
@@ -278,16 +517,19 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	if (!BN_rshift(ret,t2,mont->ri)) goto err;
 #endif /* MONT_WORD */
 
+#if !defined(BRANCH_FREE) || BRANCH_FREE==0
 	if (BN_ucmp(ret, &(mont->N)) >= 0)
 		{
 		if (!BN_usub(ret,ret,&(mont->N))) goto err;
 		}
+#endif
 	retn=1;
 	bn_check_top(ret);
  err:
 	BN_CTX_end(ctx);
 	return(retn);
 	}
+#endif /* MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD */
 
 BN_MONT_CTX *BN_MONT_CTX_new(void)
 	{
@@ -307,6 +549,11 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
 	BN_init(&(ctx->RR));
 	BN_init(&(ctx->N));
 	BN_init(&(ctx->Ni));
+#if 0 /* for OpenSSL 0.9.9 mont->n0 */
+	ctx->n0[0] = ctx->n0[1] = 0;
+#else
+	ctx->n0 = 0;
+#endif
 	ctx->flags=0;
 	}
 
@@ -340,14 +587,51 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 
 		mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
 		BN_zero(R);
+#if 0 /* for OpenSSL 0.9.9 mont->n0, would be "#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)",
+         only certain BN_BITS2<=32 platforms actually need this */
+		if (!(BN_set_bit(R,2*BN_BITS2))) goto err;	/* R */
+#else
 		if (!(BN_set_bit(R,BN_BITS2))) goto err;	/* R */
+#endif
 
 		buf[0]=mod->d[0]; /* tmod = N mod word size */
 		buf[1]=0;
+
+		BN_init(&tmod);
 		tmod.d=buf;
 		tmod.top = buf[0] != 0 ? 1 : 0;
 		tmod.dmax=2;
 		tmod.neg=0;
+
+#if 0 /* for OpenSSL 0.9.9 mont->n0, would be "#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)";
+         only certain BN_BITS2<=32 platforms actually need this */
+								tmod.top=0;
+		if ((buf[0] = mod->d[0]))			tmod.top=1;
+		if ((buf[1] = mod->top>1 ? mod->d[1] : 0))	tmod.top=2;
+
+		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
+			goto err;
+		if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
+		if (!BN_is_zero(Ri))
+			{
+			if (!BN_sub_word(Ri,1)) goto err;
+			}
+		else /* if N mod word size == 1 */
+			{
+			if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
+				goto err;
+			/* Ri-- (mod double word size) */
+			Ri->neg=0;
+			Ri->d[0]=BN_MASK2;
+			Ri->d[1]=BN_MASK2;
+			Ri->top=2;
+			}
+		if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
+		/* Ni = (R*Ri-1)/N,
+		 * keep only couple of least significant words: */
+		mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
 							/* Ri = R^-1 mod N*/
 		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
 			goto err;
@@ -363,7 +647,13 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 		if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
 		/* Ni = (R*Ri-1)/N,
 		 * keep only least significant word: */
+# if 0 /* for OpenSSL 0.9.9 mont->n0 */
+		mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[1] = 0;
+# else
 		mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
+# endif
+#endif
 		}
 #else /* !MONT_WORD */
 		{ /* bignum version */
@@ -399,7 +689,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
 	if (!BN_copy(&(to->N),&(from->N))) return NULL;
 	if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
 	to->ri=from->ri;
+#if 0 /* for OpenSSL 0.9.9 mont->n0 */
+	to->n0[0]=from->n0[0];
+	to->n0[1]=from->n0[1];
+#else
 	to->n0=from->n0;
+#endif
 	return(to);
 	}
 
diff --git a/crypto/bn/bn_mul.c b/crypto/bn/bn_mul.c
index aec1eafc65fb..b848c8cc60f4 100644
--- a/crypto/bn/bn_mul.c
+++ b/crypto/bn/bn_mul.c
@@ -389,6 +389,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r,
  * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
  * a[1]*b[1]
  */
+/* dnX may not be positive, but n2/2+dnX has to be */
 void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 	int dna, int dnb, BN_ULONG *t)
 	{
@@ -398,7 +399,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2);
+	fprintf(stderr," bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb);
 # endif
 # ifdef BN_MUL_COMBA
 #  if 0
@@ -545,6 +546,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 
 /* n+tn is the word length
  * t needs to be n*4 is size, as does r */
+/* tnX may not be negative but less than n */
 void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 	     int tna, int tnb, BN_ULONG *t)
 	{
@@ -553,8 +555,8 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n",
-		tna, n, tnb, n);
+	fprintf(stderr," bn_mul_part_recursive (%d%+d) * (%d%+d)\n",
+		n, tna, n, tnb);
 # endif
 	if (n < 8)
 		{
@@ -655,14 +657,17 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 				for (;;)
 					{
 					i/=2;
-					if (i < tna && i < tnb)
+					/* these simplified conditions work
+					 * exclusively because difference
+					 * between tna and tnb is 1 or 0 */
+					if (i < tna || i < tnb)
 						{
 						bn_mul_part_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
 							i,tna-i,tnb-i,p);
 						break;
 						}
-					else if (i <= tna && i <= tnb)
+					else if (i == tna || i == tnb)
 						{
 						bn_mul_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index f8e306bb82d3..1fc94f55c32c 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -59,6 +59,7 @@
 #include "bn_lcl.h"
 #include "cryptlib.h"
 
+
 #define BN_NIST_192_TOP	(192+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_224_TOP	(224+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_256_TOP	(256+BN_BITS2-1)/BN_BITS2
@@ -99,114 +100,106 @@ static const BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0x000001FF};
-#elif BN_BITS2 == 16
-static const BN_ULONG _nist_p_192[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFE,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-static const BN_ULONG _nist_p_224[] = {0x0001,0x0000,0x0000,0x0000,0x0000,
-	0x0000,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-static const BN_ULONG _nist_p_256[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0000,0xFFFF,
-	0xFFFF};
-static const BN_ULONG _nist_p_384[] = {0xFFFF,0xFFFF,0x0000,0x0000,0x0000,
-	0x0000,0xFFFF,0xFFFF,0xFFFE,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-static const BN_ULONG _nist_p_521[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0x01FF};
-#elif BN_BITS2 == 8
-static const BN_ULONG _nist_p_192[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF};
-static const BN_ULONG _nist_p_224[] = {0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-static const BN_ULONG _nist_p_256[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x01,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF};
-static const BN_ULONG _nist_p_384[] = {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-static const BN_ULONG _nist_p_521[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0x01};
 #endif
 
+
+static const BIGNUM _bignum_nist_p_192 =
+	{
+	(BN_ULONG *)_nist_p_192,
+	BN_NIST_192_TOP,
+	BN_NIST_192_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_224 =
+	{
+	(BN_ULONG *)_nist_p_224,
+	BN_NIST_224_TOP,
+	BN_NIST_224_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_256 =
+	{
+	(BN_ULONG *)_nist_p_256,
+	BN_NIST_256_TOP,
+	BN_NIST_256_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_384 =
+	{
+	(BN_ULONG *)_nist_p_384,
+	BN_NIST_384_TOP,
+	BN_NIST_384_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_521 =
+	{
+	(BN_ULONG *)_nist_p_521,
+	BN_NIST_521_TOP,
+	BN_NIST_521_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+
 const BIGNUM *BN_get0_nist_prime_192(void)
 	{
-	static BIGNUM const_nist_192 = { (BN_ULONG *)_nist_p_192,
-		BN_NIST_192_TOP, BN_NIST_192_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_192;
+	return &_bignum_nist_p_192;
 	}
 
 const BIGNUM *BN_get0_nist_prime_224(void)
 	{
-	static BIGNUM const_nist_224 = { (BN_ULONG *)_nist_p_224,
-		BN_NIST_224_TOP, BN_NIST_224_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_224;
+	return &_bignum_nist_p_224;
 	}
 
 const BIGNUM *BN_get0_nist_prime_256(void)
 	{
-	static BIGNUM const_nist_256 = { (BN_ULONG *)_nist_p_256,
-		BN_NIST_256_TOP, BN_NIST_256_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_256;
+	return &_bignum_nist_p_256;
 	}
 
 const BIGNUM *BN_get0_nist_prime_384(void)
 	{
-	static BIGNUM const_nist_384 = { (BN_ULONG *)_nist_p_384,
-		BN_NIST_384_TOP, BN_NIST_384_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_384;
+	return &_bignum_nist_p_384;
 	}
 
 const BIGNUM *BN_get0_nist_prime_521(void)
 	{
-	static BIGNUM const_nist_521 = { (BN_ULONG *)_nist_p_521,
-		BN_NIST_521_TOP, BN_NIST_521_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_521;
+	return &_bignum_nist_p_521;
 	}
 
-/* some misc internal functions */
-#if BN_BITS2 != 64
-static BN_ULONG _256_data[BN_NIST_256_TOP*6];
-static int _is_set_256_data = 0;
-static void _init_256_data(void);
-
-static BN_ULONG _384_data[BN_NIST_384_TOP*8];
-static int _is_set_384_data = 0;
-static void _init_384_data(void);
-#endif
-
-#define BN_NIST_ADD_ONE(a)	while (!(++(*(a)))) ++(a);
 
 static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
-        {
+	{
 	int i;
-        BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
-        for (i = (top); i != 0; i--)
-                *_tmp1++ = *_tmp2++;
-        for (i = (max) - (top); i != 0; i--)
-                *_tmp1++ = (BN_ULONG) 0;
-        }
+	BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+
+	OPENSSL_assert(top <= max);
+	for (i = (top); i != 0; i--)
+		*_tmp1++ = *_tmp2++;
+	for (i = (max) - (top); i != 0; i--)
+		*_tmp1++ = (BN_ULONG) 0;
+	}
 
 static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
-        { 
+	{ 
 	int i;
-        BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
-        for (i = (top); i != 0; i--)
-                *_tmp1++ = *_tmp2++;
-        }
+	BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+	for (i = (top); i != 0; i--)
+		*_tmp1++ = *_tmp2++;
+	}
 
 #if BN_BITS2 == 64
-#define bn_cp_64(to, n, from, m)	(to)[n] = (from)[m];
+#define bn_cp_64(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
 #define bn_64_set_0(to, n)		(to)[n] = (BN_ULONG)0;
 /* TBD */
-#define bn_cp_32(to, n, from, m)	(to)[n] = (from)[m];
+#define bn_cp_32(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
 #define bn_32_set_0(to, n)		(to)[n] = (BN_ULONG)0;
 #else
 #define bn_cp_64(to, n, from, m) \
@@ -220,26 +213,8 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
 	bn_32_set_0(to, (n)*2+1); \
 	}
 #if BN_BITS2 == 32
-#define bn_cp_32(to, n, from, m)	(to)[n] = (from)[m];
+#define bn_cp_32(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
 #define bn_32_set_0(to, n)		(to)[n] = (BN_ULONG)0;
-#elif BN_BITS2 == 16
-#define bn_cp_32(to, n, from, m) \
-	{ \
-	(to)[(n)*2]   = (from)[(m)*2];  \
-	(to)[(n)*2+1] = (from)[(m)*2+1];\
-	}
-#define bn_32_set_0(to, n) { (to)[(n)*2] = 0; (to)[(n)*2+1] = 0; }
-#elif BN_BITS2 == 8
-#define bn_cp_32(to, n, from, m) \
-	{ \
-	(to)[(n)*4]   = (from)[(m)*4];  \
-	(to)[(n)*4+1] = (from)[(m)*4+1];\
-	(to)[(n)*4+2] = (from)[(m)*4+2];\
-	(to)[(n)*4+3] = (from)[(m)*4+3];\
-	}
-#define bn_32_set_0(to, n) \
-	{ (to)[(n)*4]   = (BN_ULONG)0; (to)[(n)*4+1] = (BN_ULONG)0; \
-	  (to)[(n)*4+2] = (BN_ULONG)0; (to)[(n)*4+3] = (BN_ULONG)0; }
 #endif
 #endif /* BN_BITS2 != 64 */
 
@@ -255,10 +230,18 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
 	int      top = a->top, i;
-	BN_ULONG carry = 0;
+	int      carry;
 	register BN_ULONG *r_d, *a_d = a->d;
 	BN_ULONG t_d[BN_NIST_192_TOP],
-	         buf[BN_NIST_192_TOP];
+	         buf[BN_NIST_192_TOP],
+		 c_d[BN_NIST_192_TOP],
+		*res;
+	size_t   mask;
+
+	field = &_bignum_nist_p_192; /* just to make sure */
+
+ 	if (BN_is_negative(a) || a->top > 2*BN_NIST_192_TOP)
+		return BN_nnmod(r, field, a, ctx);
 
 	i = BN_ucmp(field, a);
 	if (i == 0)
@@ -269,9 +252,6 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else if (i > 0)
 		return (r == a) ? 1 : (BN_copy(r ,a) != NULL);
 
-	if (top == BN_NIST_192_TOP)
-		return BN_usub(r, a, field);
-
 	if (r != a)
 		{
 		if (!bn_wexpand(r, BN_NIST_192_TOP))
@@ -284,41 +264,33 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 
 	nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
 
-#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
-# pragma message save
-# pragma message disable BADSUBSCRIPT
-#endif
-
 	nist_set_192(t_d, buf, 0, 3, 3);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
-
+	carry = bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+	
 	nist_set_192(t_d, buf, 4, 4, 0);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
-
-#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
-# pragma message restore
-#endif
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_192_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
 
 	nist_set_192(t_d, buf, 5, 5, 5)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_192_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
 
-	while (carry)
-		{
-		if (bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP))
-			--carry; 
-		}
+	nist_cp_bn(r_d, res, BN_NIST_192_TOP);
 	r->top = BN_NIST_192_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
+
+	if (BN_ucmp(field, r) <= 0)
 		{
-		bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP);
-		bn_correct_top(r);
+		if (!BN_usub(r, r, field)) return 0;
 		}
 
-	bn_check_top(r);
 	return 1;
 	}
 
@@ -336,12 +308,20 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#if BN_BITS2 != 64
+#if BN_BITS2 == 32
 	int	top = a->top, i;
-	int	carry = 0;
+	int	carry;
 	BN_ULONG *r_d, *a_d = a->d;
 	BN_ULONG t_d[BN_NIST_224_TOP],
-	         buf[BN_NIST_224_TOP];
+	         buf[BN_NIST_224_TOP],
+		 c_d[BN_NIST_224_TOP],
+		*res;
+	size_t   mask;
+
+	field = &_bignum_nist_p_224; /* just to make sure */
+
+ 	if (BN_is_negative(a) || a->top > 2*BN_NIST_224_TOP)
+		return BN_nnmod(r, field, a, ctx);
 
 	i = BN_ucmp(field, a);
 	if (i == 0)
@@ -352,9 +332,6 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
-	if (top == BN_NIST_224_TOP)
-		return BN_usub(r, a, field);
-
 	if (r != a)
 		{
 		if (!bn_wexpand(r, BN_NIST_224_TOP))
@@ -368,65 +345,53 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
 
 	nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		++carry;
+	carry = bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+	
 	nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_224_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		--carry;
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP);
+	bn_add_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+#else
+	if (bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP))
+		bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+#endif
 	nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		--carry;
-
-	if (carry > 0)
-		while (carry)
-			{
-			if (bn_sub_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
-				--carry;
-			}
-	else if (carry < 0)
-		while (carry)
-			{
-			if (bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
-				++carry;
-			}
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP);
+	bn_add_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
 
+	nist_cp_bn(r_d, res, BN_NIST_224_TOP);
+#else
+	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
+		bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP);
+#endif
 	r->top = BN_NIST_224_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
+
+	if (BN_ucmp(field, r) <= 0)
 		{
-		bn_sub_words(r_d, r_d, _nist_p_224, BN_NIST_224_TOP);
-		bn_correct_top(r);
+		if (!BN_usub(r, r, field)) return 0;
 		}
-	bn_check_top(r);
+
 	return 1;
-#else
+#else	/* BN_BITS!=32 */
 	return 0;
 #endif
 	}
 
-#if BN_BITS2 != 64
-static void _init_256_data(void)
-	{
-	int	i;
-	BN_ULONG *tmp1 = _256_data;
-	const BN_ULONG *tmp2 = tmp1;
-
-	memcpy(tmp1, _nist_p_256, BN_NIST_256_TOP * sizeof(BN_ULONG));
-	tmp1 += BN_NIST_256_TOP;
-
-	for (i=0; i<5; i++)
-		{
-		bn_add_words(tmp1, _nist_p_256, tmp2, BN_NIST_256_TOP);
-		tmp2  = tmp1;
-		tmp1 += BN_NIST_256_TOP;
-		}
-	_is_set_256_data = 1;
-	}
-#endif
-
 #define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \
 	{ \
 	if (a8 != 0) bn_cp_32(to, 0, from, (a8) - 8) else bn_32_set_0(to, 0)\
@@ -442,24 +407,21 @@ static void _init_256_data(void)
 int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#if BN_BITS2 != 64
+#if BN_BITS2 == 32
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *a_d = a->d, *r_d;
 	BN_ULONG t_d[BN_NIST_256_TOP],
-	         t_d2[BN_NIST_256_TOP],
-	         buf[BN_NIST_256_TOP];
+	         buf[BN_NIST_256_TOP],
+		 c_d[BN_NIST_256_TOP],
+		*res;
+	size_t   mask;
+
+	field = &_bignum_nist_p_256; /* just to make sure */
+
+ 	if (BN_is_negative(a) || a->top > 2*BN_NIST_256_TOP)
+		return BN_nnmod(r, field, a, ctx);
 
-	if (!_is_set_256_data)
-		{
-		CRYPTO_w_lock(CRYPTO_LOCK_BN);
-		
-		if (!_is_set_256_data)
-			_init_256_data();
-		
-		CRYPTO_w_unlock(CRYPTO_LOCK_BN);
-		}
-	
 	i = BN_ucmp(field, a);
 	if (i == 0)
 		{
@@ -469,9 +431,6 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
-	if (top == BN_NIST_256_TOP)
-		return BN_usub(r, a, field);
-
 	if (r != a)
 		{
 		if (!bn_wexpand(r, BN_NIST_256_TOP))
@@ -487,98 +446,96 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	/*S1*/
 	nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
 	/*S2*/
-	nist_set_256(t_d2,buf, 0, 15, 14, 13, 12, 0, 0, 0);
-	if (bn_add_words(t_d, t_d, t_d2, BN_NIST_256_TOP))
-		carry = 2;
-	/* left shift */
-		{
-		register BN_ULONG *ap,t,c;
-		ap = t_d;
-		c=0;
-		for (i = BN_NIST_256_TOP; i != 0; --i)
-			{
-			t= *ap;
-			*(ap++)=((t<<1)|c)&BN_MASK2;
-			c=(t & BN_TBIT)?1:0;
-			}
-		if (c)
-			++carry;
-		}
+	nist_set_256(c_d,buf, 0, 15, 14, 13, 12, 0, 0, 0);
+	carry = bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,t_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)t_d&~mask));
+
+	carry = bn_add_words(t_d, res, res, BN_NIST_256_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,t_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)t_d&~mask));
+
+	carry = bn_add_words(r_d, r_d, res, BN_NIST_256_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
 
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
 	/*S3*/
 	nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_256_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S4*/
 	nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_256_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*D1*/
 	nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP);
+	bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));	
+#else
+	if (bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP))
+		bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+#endif
 	/*D2*/
 	nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP);
+	bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));	
+#else
 	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+		bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+#endif
 	/*D3*/
 	nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP);
+	bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));	
+#else
 	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+		bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+#endif
 	/*D4*/
 	nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
-	
-	if (carry)
-		{
-		if (carry > 0)
-			bn_sub_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
-				--carry, BN_NIST_256_TOP);
-		else
-			{
-			carry = -carry;
-			bn_add_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
-				--carry, BN_NIST_256_TOP);
-			}
-		}
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP);
+	bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));	
 
+	nist_cp_bn(r_d, res, BN_NIST_384_TOP);
+#else
+	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
+		bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP);
+#endif
 	r->top = BN_NIST_256_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
+
+	if (BN_ucmp(field, r) <= 0)
 		{
-		bn_sub_words(r_d, r_d, _nist_p_256, BN_NIST_256_TOP);
-		bn_correct_top(r);
+		if (!BN_usub(r, r, field)) return 0;
 		}
-	bn_check_top(r);
+
 	return 1;
-#else
+#else	/* BN_BITS!=32 */
 	return 0;
 #endif
 	}
 
-#if BN_BITS2 != 64
-static void _init_384_data(void)
-	{
-	int	i;
-	BN_ULONG *tmp1 = _384_data;
-	const BN_ULONG *tmp2 = tmp1;
-
-	memcpy(tmp1, _nist_p_384, BN_NIST_384_TOP * sizeof(BN_ULONG));
-	tmp1 += BN_NIST_384_TOP;
-
-	for (i=0; i<7; i++)
-		{
-		bn_add_words(tmp1, _nist_p_384, tmp2, BN_NIST_384_TOP);
-		tmp2  = tmp1;
-		tmp1 += BN_NIST_384_TOP;
-		}
-	_is_set_384_data = 1;
-	}
-#endif
-
 #define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
 	{ \
 	if (a12 != 0) bn_cp_32(to, 0, from,  (a12) - 12) else bn_32_set_0(to, 0)\
@@ -598,22 +555,20 @@ static void _init_384_data(void)
 int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#if BN_BITS2 != 64
+#if BN_BITS2 == 32
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *r_d, *a_d = a->d;
 	BN_ULONG t_d[BN_NIST_384_TOP],
-	         buf[BN_NIST_384_TOP];
+	         buf[BN_NIST_384_TOP],
+		 c_d[BN_NIST_384_TOP],
+		*res;
+	size_t	 mask;
 
-	if (!_is_set_384_data)
-		{
-		CRYPTO_w_lock(CRYPTO_LOCK_BN);
-		
-		if (!_is_set_384_data)
-			_init_384_data();
+	field = &_bignum_nist_p_384; /* just to make sure */
 
-		CRYPTO_w_unlock(CRYPTO_LOCK_BN);
-		}
+ 	if (BN_is_negative(a) || a->top > 2*BN_NIST_384_TOP)
+		return BN_nnmod(r, field, a, ctx);
 
 	i = BN_ucmp(field, a);
 	if (i == 0)
@@ -624,9 +579,6 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
-	if (top == BN_NIST_384_TOP)
-		return BN_usub(r, a, field);
-
 	if (r != a)
 		{
 		if (!bn_wexpand(r, BN_NIST_384_TOP))
@@ -646,72 +598,108 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 		register BN_ULONG *ap,t,c;
 		ap = t_d;
 		c=0;
-		for (i = BN_NIST_256_TOP; i != 0; --i)
+		for (i = 3; i != 0; --i)
 			{
 			t= *ap;
 			*(ap++)=((t<<1)|c)&BN_MASK2;
 			c=(t & BN_TBIT)?1:0;
 			}
+		*ap=c;
 		}
-	if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
-		t_d, BN_NIST_256_TOP))
-		++carry;
+	carry = bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
+		t_d, BN_NIST_256_TOP);
+	/*
+	 * we need if (result>=modulus) subtract(result,modulus);
+	 * in n-bit space this can be expressed as
+	 * if (carry || result>=modulus) subtract(result,modulus);
+	 * the catch is that comparison implies subtraction and
+	 * therefore one can write tmp=subtract(result,modulus);
+	 * and then if(carry || !borrow) result=tmp; this's what
+	 * happens below, but without explicit if:-) a.
+	 */
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S2 */
-	if (bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, buf, BN_NIST_384_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S3*/
 	nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S4*/
 	nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S5*/
-	nist_set_256(t_d, buf, 0, 0, 0, 0, 23-4, 22-4, 21-4, 20-4);
-	if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
-		t_d, BN_NIST_256_TOP))
-		++carry;
+	nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*S6*/
 	nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP);
+	mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = ~mask | (0-(size_t)carry);
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+
 	/*D1*/
 	nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP);
+	bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+#else
+	if (bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP))
+		bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+#endif
 	/*D2*/
 	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP);
+	bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
+#else
 	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
+		bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+#endif
 	/*D3*/
 	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
-	
-	if (carry)
-		{
-		if (carry > 0)
-			bn_sub_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
-				--carry, BN_NIST_384_TOP);
-		else
-			{
-			carry = -carry;
-			bn_add_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
-				--carry, BN_NIST_384_TOP);
-			}
-		}
+#if BRANCH_FREE
+	carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP);
+	bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+	mask = 0-(size_t)carry;
+	res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask));
 
+	nist_cp_bn(r_d, res, BN_NIST_384_TOP);
+#else
+	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
+		bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP);
+#endif
 	r->top = BN_NIST_384_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
+
+	if (BN_ucmp(field, r) <= 0)
 		{
-		bn_sub_words(r_d, r_d, _nist_p_384, BN_NIST_384_TOP);
-		bn_correct_top(r);
+		if (!BN_usub(r, r, field)) return 0;
 		}
-	bn_check_top(r);
+
 	return 1;
-#else
+#else	/* BN_BITS!=32 */
 	return 0;
 #endif
 	}
@@ -723,20 +711,37 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 #define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
 #elif BN_BITS2 == 32
 #define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
-#elif BN_BITS2 == 16
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
-#elif BN_BITS2 == 8
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1
 #endif
 	int	top, ret = 0;
-	BN_ULONG *r_d;
 	BIGNUM	*tmp;
 
+	field = &_bignum_nist_p_521; /* just to make sure */
+
+ 	if (BN_is_negative(a))
+		return BN_nnmod(r, field, a, ctx);
+
 	/* check whether a reduction is necessary */
 	top = a->top;
 	if (top < BN_NIST_521_TOP  || ( top == BN_NIST_521_TOP &&
-           (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))))
-		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
+	    (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))))
+		{
+		int i = BN_ucmp(field, a);
+		if (i == 0)
+			{
+			BN_zero(r);
+			return 1;
+			}
+		else
+			{
+#ifdef BN_DEBUG
+			OPENSSL_assert(i > 0); /* because 'field' is 1111...1111 */
+#endif
+			return (r == a)? 1 : (BN_copy(r ,a) != NULL);
+			}
+		}
+
+ 	if (BN_num_bits(a) > 2*521)
+		return BN_nnmod(r, field, a, ctx);
 
 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);
@@ -756,15 +761,11 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 
 	if (!BN_uadd(r, tmp, r))
 		goto err;
-	top = r->top;
-	r_d = r->d;
-	if (top == BN_NIST_521_TOP  && 
-           (r_d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))
+
+	if (BN_ucmp(field, r) <= 0)
 		{
-		BN_NIST_ADD_ONE(r_d)
-		r_d[BN_NIST_521_TOP-1] &= BN_NIST_521_TOP_MASK; 
+		if (!BN_usub(r, r, field)) goto err;
 		}
-	bn_correct_top(r);
 
 	ret = 1;
 err:
diff --git a/crypto/bn/bn_prime.c b/crypto/bn/bn_prime.c
index 5bab019553bf..7b25979dd1c4 100644
--- a/crypto/bn/bn_prime.c
+++ b/crypto/bn/bn_prime.c
@@ -377,14 +377,14 @@ static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
 static int probable_prime(BIGNUM *rnd, int bits)
 	{
 	int i;
-	BN_ULONG mods[NUMPRIMES];
+	prime_t mods[NUMPRIMES];
 	BN_ULONG delta,maxdelta;
 
 again:
 	if (!BN_rand(rnd,bits,1,1)) return(0);
 	/* we now have a random number 'rand' to test. */
 	for (i=1; i<NUMPRIMES; i++)
-		mods[i]=BN_mod_word(rnd,(BN_ULONG)primes[i]);
+		mods[i]=(prime_t)BN_mod_word(rnd,(BN_ULONG)primes[i]);
 	maxdelta=BN_MASK2 - primes[NUMPRIMES-1];
 	delta=0;
 	loop: for (i=1; i<NUMPRIMES; i++)
diff --git a/crypto/bn/bn_prime.h b/crypto/bn/bn_prime.h
index b7cf9a9bfe12..51d2194febdb 100644
--- a/crypto/bn/bn_prime.h
+++ b/crypto/bn/bn_prime.h
@@ -58,10 +58,12 @@
 
 #ifndef EIGHT_BIT
 #define NUMPRIMES 2048
+typedef unsigned short prime_t;
 #else
 #define NUMPRIMES 54
+typedef unsigned char prime_t;
 #endif
-static const unsigned int primes[NUMPRIMES]=
+static const prime_t primes[NUMPRIMES]=
 	{
 	   2,   3,   5,   7,  11,  13,  17,  19,
 	  23,  29,  31,  37,  41,  43,  47,  53,
diff --git a/crypto/bn/bn_prime.pl b/crypto/bn/bn_prime.pl
index e583d1d53b9d..3fafb6f3e90a 100644
--- a/crypto/bn/bn_prime.pl
+++ b/crypto/bn/bn_prime.pl
@@ -101,10 +101,12 @@ for ($i=0; $i <= $#primes; $i++)
 
 printf "#ifndef EIGHT_BIT\n";
 printf "#define NUMPRIMES %d\n",$num;
+printf "typedef unsigned short prime_t;\n";
 printf "#else\n";
 printf "#define NUMPRIMES %d\n",$eight;
+printf "typedef unsigned char prime_t;\n";
 printf "#endif\n";
-print "static const unsigned int primes[NUMPRIMES]=\n\t{\n\t";
+print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
 $init=0;
 for ($i=0; $i <= $#primes; $i++)
 	{
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index c885300a669f..310763eca0c6 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -184,120 +184,120 @@ int main(int argc, char *argv[])
 
 	message(out,"BN_add");
 	if (!test_add(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_sub");
 	if (!test_sub(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift1");
 	if (!test_lshift1(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift (fixed)");
 	if (!test_lshift(out,ctx,BN_bin2bn(lst,sizeof(lst)-1,NULL)))
 	    goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift");
 	if (!test_lshift(out,ctx,NULL)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_rshift1");
 	if (!test_rshift1(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_rshift");
 	if (!test_rshift(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_sqr");
 	if (!test_sqr(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mul");
 	if (!test_mul(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div");
 	if (!test_div(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div_word");
 	if (!test_div_word(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div_recp");
 	if (!test_div_recp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod");
 	if (!test_mod(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_mul");
 	if (!test_mod_mul(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mont");
 	if (!test_mont(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_exp");
 	if (!test_mod_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_exp_mont_consttime");
 	if (!test_mod_exp_mont_consttime(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_exp");
 	if (!test_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_kronecker");
 	if (!test_kron(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_sqrt");
 	if (!test_sqrt(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_add");
 	if (!test_gf2m_add(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod");
 	if (!test_gf2m_mod(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_mul");
 	if (!test_gf2m_mod_mul(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_sqr");
 	if (!test_gf2m_mod_sqr(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_inv");
 	if (!test_gf2m_mod_inv(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_div");
 	if (!test_gf2m_mod_div(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_exp");
 	if (!test_gf2m_mod_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_sqrt");
 	if (!test_gf2m_mod_sqrt(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_solve_quad");
 	if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	BN_CTX_free(ctx);
 	BIO_free(out);
@@ -307,7 +307,7 @@ int main(int argc, char *argv[])
 err:
 	BIO_puts(out,"1\n"); /* make sure the Perl script fed by bc notices
 	                      * the failure, see test_bn in test/Makefile.ssl*/
-	BIO_flush(out);
+	(void)BIO_flush(out);
 	ERR_load_crypto_strings();
 	ERR_print_errors_fp(stderr);
 	EXIT(1);