ghash-ia64.pl: new file, GHASH for Itanium.

ghash-x86_64.pl: minimize stack frame usage. ghash-x86.pl: modulo-scheduling the MMX loop with respect to the input vector results in up to 10% performance improvement.
2024-12-04 23:43:55 +08:00 · 2010-03-15 19:07:52 +00:00 · 2010-03-15 19:07:52 +00:00 · 480cd6ab6e
commit 480cd6ab6e
parent 6c6bdd543d
3 changed files with 291 additions and 36 deletions
--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@ -0,0 +1,228 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" Galois field multiplication and
+# streamed GHASH function. "4-bit" means that it uses 256 bytes
+# per-key table [+128 bytes shared table]. Streamed GHASH performance
+# was measured to be 6.35 cycles per processed byte on Itanium 2,
+# which is >90% better than Microsoft compiler generated code. Well,
+# the number should have been ~6.5. The deviation has everything to do
+# with the way performance is measured, as the difference between GCM
+# and straightforward 128-bit counter mode. To anchor it to something
+# else, the sha1-ia64.pl module processes one byte in 6.0 cycles. On
+# Itanium GHASH should run at ~8.5 cycles per byte.
+
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+# Remaining arguments are compiler flags; they select pointer width and byte order.
+if ($^O eq "hpux") {
+    $ADDP="addp4";	# HP-UX default: 32-bit pointers, so pointer arithmetic uses addp4
+    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }	# +DD64 or -mlp64: 64-bit pointers, plain add
+} else { $ADDP="add"; }
+for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
+                $big_endian=0 if (/\-DL_ENDIAN/);  }
+if (!defined($big_endian))	# no explicit flag: use the byte order of the host running this script
+             {  $big_endian=(unpack('L',pack('N',1))==1);  }
+
+sub loop() {	# appends the modulo-scheduled inner loop to $code; args: label[, mask-inp flag]
+my $label=shift;	# emitted as the loop's entry label and as the br.ctop target
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # true flag: the two instructions touching inp/in[] are guarded by p63 instead, i.e. mask references to inp
+# The empty prototype is not enforced: both callers use the &loop(...) form.
+# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned on a
+# 128-byte boundary and the lower 7 bits of its address are
+# guaranteed to be zero.
+$code.=<<___;
+$label:
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
+	($p17)	xor	xi[1]=xi[1],in[1]	};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p19)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p19)	ld8	rem=[rem]
+	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
+{ .mmi;	($p16)	ld1	in[0]=[inp],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p19)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p19)	xor	Hhi=Hhi,rem
+	(p18)	add	Hi[1]=Htbl,Hi[1]	};;
+
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
+	(p18)	xor	Zhi=Zhi,Hhi		};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p18)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p18)	ld8	rem=[rem]
+	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
+{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p18)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p18)	xor	Hhi=Hhi,rem
+	(p17)	add	Hi[0]=Htbl,Hi[0]
+	br.ctop.sptk	$label			};;
+___
+}
+
+$code=<<___;	# register aliases, then gcm_gmult_4bit from its prologue up to the loop
+.explicit
+.text
+
+prevfs=r2;	prevlc=r3;	prevpr=r8;
+mask0xf0=r21;
+rem=r22;	rem_4bitp=r23;
+Xi=r24;		Htbl=r25;
+inp=r26;	end=r27;
+Hhi=r28;	Hlo=r29;
+Zhi=r30;	Zlo=r31;
+
+.global	gcm_gmult_4bit#
+.proc	gcm_gmult_4bit#
+.align	128
+.skip	16;;					// aligns loop body
+gcm_gmult_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,2,6,0,8
+	$ADDP	Xi=15,in0			// &Xi[15]
+	mov	rem_4bitp=ip		}
+{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotr	in[3],xi[3],Hi[2]
+
+{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
+	mov	mask0xf0=0xf0
+	brp.loop.imp	.Loop1,.Lend1-16};;
+{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
+					};;
+{ .mii;	shladd	Hi[1]=xi[2],4,r0
+	mov	pr.rot=0x7<<16
+	mov	ar.lc=13		};;
+{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
+	mov	ar.ec=3
+	xor	Zlo=Zlo,Zlo		};;
+{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
+	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+	xor	Zhi=Zhi,Zhi		};;
+___
+	&loop	(".Loop1",1);	# flag set: gcm_gmult_4bit never sets up inp, so the loop's inp references are masked
+$code.=<<___;	# gcm_gmult_4bit epilogue, then gcm_ghash_4bit from its prologue up to the loop
+.Lend1:
+{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
+{ .mib;	mux1	Zlo=Zlo,\@rev		};;
+{ .mib;	mux1	Zhi=Zhi,\@rev		};;
+{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
+	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
+{ .mib;	st8	[Hlo]=Zlo
+	mov	pr=prevpr,-2		};;
+{ .mib;	st8	[Hhi]=Zhi
+	mov	ar.lc=prevlc
+	br.ret.sptk.many	b0	};;
+.endp	gcm_gmult_4bit#
+
+.global	gcm_ghash_4bit#
+.proc	gcm_ghash_4bit#
+.align	32;;
+gcm_ghash_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,4,4,0,8
+	$ADDP	inp=15,in0			// &inp[15]
+	mov	rem_4bitp=ip		}
+{ .mmi;	$ADDP	end=in1,in0			// &inp[len]
+	$ADDP	Xi=15,in2			// &Xi[15]
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc		};;
+{ .mmi;	$ADDP	Htbl=8,in3			// &Htbl[0].lo
+	mov	mask0xf0=0xf0
+	.save	pr,prevpr
+	mov	prevpr=pr		}
+
+	.body
+	.rotr	in[3],xi[3],Hi[2]
+
+{ .mmi;	ld1	in[2]=[inp],-1			// inp[15]
+	ld1	xi[2]=[Xi],-1			// Xi[15]
+	add	end=-17,end		};;
+{ .mmi;	ld1	in[1]=[inp],-1			// inp[14]
+	ld1	xi[1]=[Xi],-1			// Xi[14]
+	xor	xi[2]=xi[2],in[2]	};;
+{ .mii;	shladd	Hi[1]=xi[2],4,r0
+	mov	pr.rot=0x7<<16
+	mov	ar.lc=13		};;
+{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
+	mov	ar.ec=3
+	xor	Zlo=Zlo,Zlo		};;
+{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
+	add	rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
+	xor	Zhi=Zhi,Zhi		};;
+___
+	&loop	(".LoopN");	# no flag: the loop also loads inp bytes and XORs them into the Xi bytes
+$code.=<<___;	# gcm_ghash_4bit per-block tail and epilogue, then the rem_4bit table and ident string
+{ .mib;	xor	Zhi=Zhi,Hhi			// modulo-scheduling artefact
+	extr.u	xi[2]=Zlo,0,8		}	// Xi[15]
+{ .mib;	cmp.ltu	p6,p0=inp,end			// are we done?
+	add	inp=32,inp			// advance inp
+	clrrrb.pr			};;
+{ .mii;
+(p6)	ld1	in[2]=[inp],-1			// inp[15]
+(p6)	extr.u	xi[1]=Zlo,8,8			// Xi[14]
+(p6)	mov	ar.lc=13		};;
+{ .mii;
+(p6)	ld1	in[1]=[inp],-1			// inp[14]
+(p6)	mov	ar.ec=3
+	mux1	Zlo=Zlo,\@rev		};;
+{ .mii;
+(p6)	xor	xi[2]=xi[2],in[2]
+	mux1	Zhi=Zhi,\@rev		};;
+{ .mii;
+(p6)	shladd	Hi[1]=xi[2],4,r0
+	add	Hlo=9,Xi			// Xi is &Xi[-1]
+	add	Hhi=1,Xi		};;
+{ .mii;
+(p6)	and	Hi[1]=mask0xf0,Hi[1]
+(p6)	add	Xi=14,Xi			// &Xi[13]
+(p6)	mov	pr.rot=0x7<<16		};;
+
+{ .mii; st8	[Hlo]=Zlo
+(p6)	xor	Zlo=Zlo,Zlo
+(p6)	add	Hi[1]=Htbl,Hi[1]	};;
+{ .mib;	st8	[Hhi]=Zhi
+(p6)	xor	Zhi=Zhi,Zhi
+(p6)	br.cond.dptk.many	.LoopN	};;
+
+{ .mib;	mov	pr=prevpr,-2		}
+{ .mib;	mov	ar.lc=prevlc
+	br.ret.sptk.many	b0	};;
+.endp	gcm_ghash_4bit#
+
+.align	128;;
+.type	rem_4bit#,\@object
+rem_4bit:
+        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size	rem_4bit#,128
+stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);	# big-endian target: turn every byte-reversing "mux1 ...,\@rev" into a no-op
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";	# buffered write errors only surface at close; fail rather than leave a truncated file
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@ -7,9 +7,11 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
+# March 2010
+#
 # The module implements "4-bit" Galois field multiplication and
 # streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128/256 bytes fixed table]. It has two code paths:
+# per-key table [+64/128 bytes fixed table]. It has two code paths:
 # vanilla x86 and vanilla MMX. Former will be executed on 486 and
 # Pentium, latter on all others. Performance results are for streamed
 # GHASH subroutine and are expressed in cycles per processed byte,
@ -18,13 +20,13 @@
 #		gcc 2.95.3(*)	MMX assembler	x86 assembler
 #
 # Pentium	100/112(**)	-		50
-# PIII		63 /77		17		24
-# P4		96 /122		33		84(***)
-# Opteron	50 /71		22		30
-# Core2		63 /102		21		28
+# PIII		63 /77		16		24
+# P4		96 /122		30		84(***)
+# Opteron	50 /71		21		30
+# Core2		63 /102		19		28
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
-#	which is one of reasons why 2.95.3 result were chosen;
+#	which is one of reasons why 2.95.3 results were chosen,
 #	another reason is lack of 3.4.x results for older CPUs;
 # (**)	second number is result for code compiled with -fPIC flag,
 #	which is actually more relevant, because assembler code is
@ -32,8 +34,8 @@
 # (***)	see comment in non-MMX routine for further details;
 #
 # To summarize, it's 2-3 times faster than gcc-generated code. To
-# anchor it to something else SHA1 assembler processes single byte
-# in 11-13 cycles.
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores.

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@ -52,13 +54,13 @@ $Htbl = "esi";

 $unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
 		# than unrolled, which has to be weighted against
-		# almost 2x code size reduction. Well, *overall*
-		# code size. x86-specific code shrinks by 7.5x...
+		# 1.7x code size reduction. Well, *overall* 1.7x,
+		# x86-specific code itself shrinks by 2.5x...

 sub mmx_loop() {
-# MMX version performs 2.5 times better on P4 (see comment in non-MMX
-# routine for further details), 35% better on Opteron and Core2, 40%
-# better on PIII... In other words effort is considered to be well
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron, 50% better
+# on PIII and Core2... In other words effort is considered to be well
 # spent...
    my $inp = shift;
    my $rem_4bit = shift;
@ -74,7 +76,7 @@ sub mmx_loop() {
 	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
 	&mov	($nhi,$Zll);
 	&mov	(&LB($nlo),&LB($nhi));
-	&mov	($cnt,15);
+	&mov	($cnt,14);
 	&shl	(&LB($nlo),4);
 	&and	($nhi,0xf0);
 	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
@ -85,34 +87,59 @@ sub mmx_loop() {
    &set_label("mmx_loop",16);
 	&psrlq	($Zlo,4);
 	&and	($rem,0xf);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
 	&movq	($tmp,$Zhi);
 	&psrlq	($Zhi,4);
+	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
 	&dec	($cnt);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
 	&psllq	($tmp,60);
 	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
 	&movd	($rem,$Zlo);
 	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
 	&pxor	($Zlo,$tmp);
 	&js	(&label("mmx_break"));

-	&movz	($nhi,&BP(0,$inp,$cnt));
-	&psrlq	($Zlo,4);
-	&mov	(&LB($nlo),&LB($nhi));
-	&movq	($tmp,$Zhi);
 	&shl	(&LB($nlo),4);
-	&psrlq	($Zhi,4);
 	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
 	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
 	&psllq	($tmp,60);
 	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
 	&movd	($rem,$Zlo);
 	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
 	&pxor	($Zlo,$tmp);
-	&and	($nhi,0xf0);
 	&jmp	(&label("mmx_loop"));

    &set_label("mmx_break",16);
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
+	&pxor	($Zlo,$tmp);
+
 	&psrlq	($Zlo,32);	# lower part of Zlo is already there
 	&movd	($Zhl,$Zhi);
 	&psrlq	($Zhi,32);
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@ -7,9 +7,11 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
+# March 2010
+#
 # The module implements "4-bit" Galois field multiplication and
 # streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128 bytes fixed table]. Performance results are for
+# per-key table [+128 bytes shared table]. Performance results are for
 # streamed GHASH subroutine and are expressed in cycles per processed
 # byte, less is better:
 #
@ -136,9 +138,8 @@ $code=<<___;
 .align	16
 gcm_gmult_4bit:
 	push	%rbx
-	push	%rbp
-	push	%r12
-	sub	\$16,%rsp
+	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%r12		# order to reuse Win64 exception handler...
 .Lgmult_prologue:

 	movzb	15($Xi),$Zlo
@ -149,8 +150,8 @@ $code.=<<___;
 	mov	$Zlo,8($Xi)
 	mov	$Zhi,($Xi)

-	mov	32(%rsp),%rbx
-	lea	40(%rsp),%rsp
+	mov	16(%rsp),%rbx
+	lea	24(%rsp),%rsp
 .Lgmult_epilogue:
 	ret
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
@ -174,7 +175,6 @@ gcm_ghash_4bit:
 	push	%rbx
 	push	%rbp
 	push	%r12
-	sub	\$16,%rsp
 .Lghash_prologue:

 	mov	8($Xi),$Zlo
@ -186,11 +186,11 @@ gcm_ghash_4bit:
 	xor	8($inp),$Zlo
 	xor	($inp),$Zhi
 	lea	16($inp),$inp
-	mov	$Zlo,8(%rsp)
-	mov	$Zhi,(%rsp)
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
 	shr	\$56,$Zlo
 ___
-	&loop	("%rsp");
+	&loop	($Xi);
 $code.=<<___;
 	cmp	$len,$inp
 	jb	.Louter_loop
@ -198,10 +198,10 @@ $code.=<<___;
 	mov	$Zlo,8($Xi)
 	mov	$Zhi,($Xi)

-	mov	16(%rsp),%r12
-	mov	24(%rsp),%rbp
-	mov	32(%rsp),%rbx
-	lea	40(%rsp),%rsp
+	mov	0(%rsp),%r12
+	mov	8(%rsp),%rbp
+	mov	16(%rsp),%rbx
+	lea	24(%rsp),%rsp
 .Lghash_epilogue:
 	ret
 .size	gcm_ghash_4bit,.-gcm_ghash_4bit
@ -259,7 +259,7 @@ se_handler:
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lin_prologue

-	lea	40(%rax),%rax		# adjust "rsp"
+	lea	24(%rax),%rax		# adjust "rsp"

 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp