RC4_set_key for x86_64 and Core2 optimization.

PR: 1447
2024-11-24 18:43:34 +08:00 · 2007-04-02 09:50:14 +00:00 · 2007-04-02 09:50:14 +00:00 · 9babf3929b
commit 9babf3929b
parent 2ec0be9e77
4 changed files with 178 additions and 5 deletions
--- a/2
+++ b/2
@ -120,7 +120,7 @@ my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86
 my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
 my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";

-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::";
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::";
 my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::";
--- a/8
+++ b/8
@ -268,7 +268,7 @@ $bf_obj       =
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o rc4_skey.o
+$rc4_obj      = rc4-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@ -3152,7 +3152,7 @@ $bf_obj       =
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o rc4_skey.o
+$rc4_obj      = rc4-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@ -3964,7 +3964,7 @@ $bf_obj       =
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o rc4_skey.o
+$rc4_obj      = rc4-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@ -3992,7 +3992,7 @@ $bf_obj       =
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o rc4_skey.o
+$rc4_obj      = rc4-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@ -49,6 +49,14 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].

+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as a virtually identical 32-bit loop was
+# observed to outperform the 64-bit one by almost 50%]. Adding two movzb
+# instructions to cloop1 boosts its performance by 80%! This loop appears
+# to be the optimal fit for Core2, and therefore the code was modified to
+# skip cloop8 on this CPU.
+
 $output=shift;
 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

@ -152,6 +160,8 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmpl	\$0,260($dat)		# 32-bit flag stored by RC4_set_key; nonzero means skip the unrolled loop
+	jnz	.Lcloop1
 	push	%rbx
 	jmp	.Lcloop8
 .align	16
@ -235,6 +245,111 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___

+$idx="%r8";	# swap index accumulated from key bytes and table entries; first used as scratch for the capability word
+$ido="%r9";	# outer loop counter over the 256 table slots; first used as scratch for the bit-30 flag
+
+$code.=<<___;	# emits RC4_set_key and RC4_options; $dat, $len and $inp are defined earlier in the file (not visible here)
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat		# table starts 8 bytes in; the two dwords before it are cleared at exit
+	lea	($inp,$len),$inp	# point one past the last key byte
+	neg	$len			# negative key index that counts up towards zero
+	mov	$len,%rcx		# keep the negated length to rewind the key index
+	xor	%eax,%eax		# fill counter starts at zero
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d	# CPU capability word
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop		# bit 20 clear: use 32-bit table entries
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)	# record bit 30 as a 0/1 dword next to the table
+	jmp	.Lc1stloop		# bit 20 set: use byte table entries
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)	# table[i] = i, one dword per entry
+	add	\$1,%al
+	jnc	.Lw1stloop		# carry out of al after 255 ends the fill
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d	# r10 = table[i]
+	add	($inp,$len,1),$idx#b	# j += current key byte
+	add	%r10b,$idx#b		# j += table[i], wrapping at 256
+	add	\$1,$len		# advance key index; ZF set once the key is used up
+	mov	($dat,$idx,4),%r11d	# r11 = table[j]; mov leaves the flags intact
+	cmovz	%rcx,$len		# rewind to the first key byte
+	mov	%r10d,($dat,$idx,4)	# table[j] = old table[i]
+	mov	%r11d,($dat,$ido,4)	# table[i] = old table[j]
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop		# carry after i == 255 ends the pass
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)		# table[i] = i, one byte per entry
+	add	\$1,%al
+	jnc	.Lc1stloop		# carry out of al after 255 ends the fill
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b	# r10b = table[i]
+	add	($inp,$len),$idx#b	# j += current key byte
+	add	%r10b,$idx#b		# j += table[i], wrapping at 256
+	add	\$1,$len		# advance key index; ZF set once the key is used up
+	mov	($dat,$idx),%r11b	# r11b = table[j]; mov leaves the flags intact
+	jnz	.Lcnowrap		# flags still come from the key index increment
+	mov	%rcx,$len		# rewind to the first key byte
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)	# table[j] = old table[i]
+	mov	%r11b,($dat,$ido)	# table[i] = old table[j]
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop		# carry after i == 255 ends the pass
+	movl	\$-1,256($dat)		# marker right after the byte table; presumably what RC4 tests to pick the char path - confirm
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)		# clear the two dwords in front of the table
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax			# translator directive; presumably loads the current address - confirm
+	lea	.Lopts-.(%rax),%rax	# rax = address of the first option string
+	mov	OPENSSL_ia32cap_P(%rip),%edx
+	bt	\$20,%edx
+	jnc	.Ldone			# bit 20 clear: return "rc4(8x,int)"
+	add	\$12,%rax		# skip 11 characters plus NUL to reach "rc4(8x,char)"
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax		# skip 12 characters plus NUL to reach "rc4(1x,char)"
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, OpenSSL project"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
 $code =~ s/#([bwd])/$1/gm;

 print $code;
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@ -48,8 +48,37 @@ OPENSSL_wipe_cpu	ENDP

 OPENSSL_ia32_cpuid	PROC
 	mov	r8,rbx
+
+	xor	eax,eax
+	cpuid
+	xor	eax,eax
+	cmp	ebx,0756e6547h
+	setne	al
+	mov	r9d,eax
+	cmp	edx,049656e69h
+	setne	al
+	or	r9d,eax
+	cmp	ecx,06c65746eh
+	setne	al
+	or	r9d,eax
+
 	mov	eax,1
 	cpuid
+	bt	edx,28
+	jnc	\$Ldone
+	cmp	r9,0
+	jne	\$Lnotintel
+	or	edx,000100000h
+	and	ah,15
+	cmp	ah,15
+	je	\$Lnotintel
+	or	edx,040000000h
+\$Lnotintel:
+	shr	ebx,16
+	cmp	bl,1
+	ja	\$Ldone
+	and	edx,0efffffffh
+\$Ldone:
 	shl	rcx,32
 	mov	eax,edx
 	mov	rbx,r8
@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
 .align	16
 OPENSSL_ia32_cpuid:
 	movq	%rbx,%r8
+
+	xor	%eax,%eax
+	cpuid
+	xor	%eax,%eax
+	cmp	\$0x756e6547,%ebx	# "Genu"
+	setne	%al
+	mov	%eax,%r9d
+	cmp	\$0x49656e69,%edx	# "ineI"
+	setne	%al
+	or	%eax,%r9d
+	cmp	\$0x6c65746e,%ecx	# "ntel"
+	setne	%al
+	or	%eax,%r9d
+
 	movl	\$1,%eax
 	cpuid
+	bt	\$28,%edx		# test hyper-threading bit
+	jnc	.Ldone
+	cmp	\$0,%r9
+	jne	.Lnotintel
+	or	\$1<<20,%edx		# use reserved bit to engage RC4_CHAR
+	and	\$15,%ah
+	cmp	\$15,%ah		# examine Family ID
+	je	.Lnotintel
+	or	\$1<<30,%edx		# use reserved bit to skip unrolled loop
+.Lnotintel:
+	shr	\$16,%ebx
+	cmp	\$1,%bl			# see if cache is shared
+	ja	.Ldone
+	and	\$~(1<<28),%edx
+.Ldone:
 	shlq	\$32,%rcx
 	movl	%edx,%eax
 	movq	%r8,%rbx