RC4_set_key for x86_64 and Core2 optimization.

PR: 1447
This commit is contained in:
Andy Polyakov 2007-04-02 09:50:14 +00:00
parent 2ec0be9e77
commit 9babf3929b
4 changed files with 178 additions and 5 deletions

View File

@ -120,7 +120,7 @@ my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86
my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::";
my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::";

8
TABLE
View File

@ -268,7 +268,7 @@ $bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4_skey.o
$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@ -3152,7 +3152,7 @@ $bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4_skey.o
$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@ -3964,7 +3964,7 @@ $bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4_skey.o
$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
@ -3992,7 +3992,7 @@ $bf_obj =
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
$rc4_obj = rc4-x86_64.o rc4_skey.o
$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o

View File

@ -49,6 +49,14 @@
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
@ -152,6 +160,8 @@ $code.=<<___;
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
cmp \$0,260($dat)
jnz .Lcloop1
push %rbx
jmp .Lcloop8
.align 16
@ -235,6 +245,111 @@ $code.=<<___;
.size RC4,.-RC4
___
$idx="%r8";
$ido="%r9";
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16
RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
mov $len,%rcx
xor %eax,%eax
xor $ido,$ido
xor %r10,%r10
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
bt \$20,$idx#d
jnc .Lw1stloop
bt \$30,$idx#d
setc $ido#b
mov $ido#d,260($dat)
jmp .Lc1stloop
.align 16
.Lw1stloop:
mov %eax,($dat,%rax,4)
add \$1,%al
jnc .Lw1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lw2ndloop:
mov ($dat,$ido,4),%r10d
add ($inp,$len,1),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx,4),%r11d
cmovz %rcx,$len
mov %r10d,($dat,$idx,4)
mov %r11d,($dat,$ido,4)
add \$1,$ido#b
jnc .Lw2ndloop
jmp .Lexit_key
.align 16
.Lc1stloop:
mov %al,($dat,%rax)
add \$1,%al
jnc .Lc1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lc2ndloop:
mov ($dat,$ido),%r10b
add ($inp,$len),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx),%r11b
jnz .Lcnowrap
mov %rcx,$len
.Lcnowrap:
mov %r10b,($dat,$idx)
mov %r11b,($dat,$ido)
add \$1,$ido#b
jnc .Lc2ndloop
movl \$-1,256($dat)
.align 16
.Lexit_key:
xor %eax,%eax
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
.size RC4_set_key,.-RC4_set_key
.globl RC4_options
.type RC4_options,\@function,0
.align 16
RC4_options:
.picmeup %rax
lea .Lopts-.(%rax),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
jnc .Ldone
add \$12,%rax
bt \$30,%edx
jnc .Ldone
add \$13,%rax
.Ldone:
ret
.align 64
.Lopts:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
.asciz "rc4(1x,char)"
.asciz "RC4 for x86_64, OpenSSL project"
.align 64
.size RC4_options,.-RC4_options
___
$code =~ s/#([bwd])/$1/gm;
print $code;

View File

@ -48,8 +48,37 @@ OPENSSL_wipe_cpu ENDP
OPENSSL_ia32_cpuid PROC
mov r8,rbx
xor eax,eax
cpuid
xor eax,eax
cmp ebx,0756e6547h
setne al
mov r9d,eax
cmp edx,049656e69h
setne al
or r9d,eax
cmp ecx,06c65746eh
setne al
or r9d,eax
mov eax,1
cpuid
bt edx,28
jnc \$Ldone
cmp r9,0
jne \$Lnotintel
or edx,000100000h
and ah,15
cmp ah,15
je \$Lnotintel
or edx,040000000h
\$Lnotintel:
shr ebx,16
cmp bl,1
ja \$Ldone
and edx,0efffffffh
\$Ldone:
shl rcx,32
mov eax,edx
mov rbx,r8
@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
.align 16
OPENSSL_ia32_cpuid:
movq %rbx,%r8
xor %eax,%eax
cpuid
xor %eax,%eax
cmp \$0x756e6547,%ebx # "Genu"
setne %al
mov %eax,%r9d
cmp \$0x49656e69,%edx # "ineI"
setne %al
or %eax,%r9d
cmp \$0x6c65746e,%ecx # "ntel"
setne %al
or %eax,%r9d
movl \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
jnc .Ldone
cmp \$0,%r9
jne .Lnotintel
or \$1<<20,%edx # use reserved bit to engage RC4_CHAR
and \$15,%ah
cmp \$15,%ah # examine Family ID
je .Lnotintel
or \$1<<30,%edx # use reserved bit to skip unrolled loop
.Lnotintel:
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
ja .Ldone
and \$~(1<<28),%edx
.Ldone:
shlq \$32,%rcx
movl %edx,%eax
movq %r8,%rbx