mirror of
https://github.com/openssl/openssl.git
synced 2024-11-24 18:43:34 +08:00
RC4_set_key for x86_64 and Core2 optimization.
PR: 1447
This commit is contained in:
parent
2ec0be9e77
commit
9babf3929b
@ -120,7 +120,7 @@ my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86
|
||||
my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
|
||||
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";
|
||||
|
||||
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o";
|
||||
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
|
||||
my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::";
|
||||
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::";
|
||||
my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::";
|
||||
|
8
TABLE
8
TABLE
@ -268,7 +268,7 @@ $bf_obj =
|
||||
$md5_obj = md5-x86_64.o
|
||||
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
|
||||
$cast_obj =
|
||||
$rc4_obj = rc4-x86_64.o rc4_skey.o
|
||||
$rc4_obj = rc4-x86_64.o
|
||||
$rmd160_obj =
|
||||
$rc5_obj =
|
||||
$wp_obj = wp-x86_64.o
|
||||
@ -3152,7 +3152,7 @@ $bf_obj =
|
||||
$md5_obj = md5-x86_64.o
|
||||
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
|
||||
$cast_obj =
|
||||
$rc4_obj = rc4-x86_64.o rc4_skey.o
|
||||
$rc4_obj = rc4-x86_64.o
|
||||
$rmd160_obj =
|
||||
$rc5_obj =
|
||||
$wp_obj = wp-x86_64.o
|
||||
@ -3964,7 +3964,7 @@ $bf_obj =
|
||||
$md5_obj = md5-x86_64.o
|
||||
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
|
||||
$cast_obj =
|
||||
$rc4_obj = rc4-x86_64.o rc4_skey.o
|
||||
$rc4_obj = rc4-x86_64.o
|
||||
$rmd160_obj =
|
||||
$rc5_obj =
|
||||
$wp_obj = wp-x86_64.o
|
||||
@ -3992,7 +3992,7 @@ $bf_obj =
|
||||
$md5_obj = md5-x86_64.o
|
||||
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
|
||||
$cast_obj =
|
||||
$rc4_obj = rc4-x86_64.o rc4_skey.o
|
||||
$rc4_obj = rc4-x86_64.o
|
||||
$rmd160_obj =
|
||||
$rc5_obj =
|
||||
$wp_obj = wp-x86_64.o
|
||||
|
@ -49,6 +49,14 @@
|
||||
# is not implemented, then this final RC4_CHAR code-path should be
|
||||
# preferred, as it provides better *all-round* performance].
|
||||
|
||||
# Intel Core2 was observed to perform poorly on both code paths:-( It
|
||||
# apparently suffers from some kind of partial register stall, which
|
||||
# occurs in 64-bit mode only [as a virtually identical 32-bit loop was
|
||||
# observed to outperform the 64-bit one by almost 50%]. Adding two movzb instructions to
|
||||
# cloop1 boosts its performance by 80%! This loop appears to be an optimal
|
||||
# fit for Core2 and therefore the code was modified to skip cloop8 on
|
||||
# this CPU.
|
||||
|
||||
$output=shift;
|
||||
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
|
||||
|
||||
@ -152,6 +160,8 @@ $code.=<<___;
|
||||
movzb ($dat,$XX[0]),$TX[0]#d
|
||||
test \$-8,$len
|
||||
jz .Lcloop1
|
||||
cmp \$0,260($dat)
|
||||
jnz .Lcloop1
|
||||
push %rbx
|
||||
jmp .Lcloop8
|
||||
.align 16
|
||||
@ -235,6 +245,111 @@ $code.=<<___;
|
||||
.size RC4,.-RC4
|
||||
___
|
||||
|
||||
$idx="%r8";
|
||||
$ido="%r9";
|
||||
|
||||
$code.=<<___;
|
||||
.extern OPENSSL_ia32cap_P
|
||||
.globl RC4_set_key
|
||||
.type RC4_set_key,\@function,3
|
||||
.align 16
|
||||
RC4_set_key:
|
||||
lea 8($dat),$dat
|
||||
lea ($inp,$len),$inp
|
||||
neg $len
|
||||
mov $len,%rcx
|
||||
xor %eax,%eax
|
||||
xor $ido,$ido
|
||||
xor %r10,%r10
|
||||
xor %r11,%r11
|
||||
|
||||
mov OPENSSL_ia32cap_P(%rip),$idx#d
|
||||
bt \$20,$idx#d
|
||||
jnc .Lw1stloop
|
||||
bt \$30,$idx#d
|
||||
setc $ido#b
|
||||
mov $ido#d,260($dat)
|
||||
jmp .Lc1stloop
|
||||
|
||||
.align 16
|
||||
.Lw1stloop:
|
||||
mov %eax,($dat,%rax,4)
|
||||
add \$1,%al
|
||||
jnc .Lw1stloop
|
||||
|
||||
xor $ido,$ido
|
||||
xor $idx,$idx
|
||||
.align 16
|
||||
.Lw2ndloop:
|
||||
mov ($dat,$ido,4),%r10d
|
||||
add ($inp,$len,1),$idx#b
|
||||
add %r10b,$idx#b
|
||||
add \$1,$len
|
||||
mov ($dat,$idx,4),%r11d
|
||||
cmovz %rcx,$len
|
||||
mov %r10d,($dat,$idx,4)
|
||||
mov %r11d,($dat,$ido,4)
|
||||
add \$1,$ido#b
|
||||
jnc .Lw2ndloop
|
||||
jmp .Lexit_key
|
||||
|
||||
.align 16
|
||||
.Lc1stloop:
|
||||
mov %al,($dat,%rax)
|
||||
add \$1,%al
|
||||
jnc .Lc1stloop
|
||||
|
||||
xor $ido,$ido
|
||||
xor $idx,$idx
|
||||
.align 16
|
||||
.Lc2ndloop:
|
||||
mov ($dat,$ido),%r10b
|
||||
add ($inp,$len),$idx#b
|
||||
add %r10b,$idx#b
|
||||
add \$1,$len
|
||||
mov ($dat,$idx),%r11b
|
||||
jnz .Lcnowrap
|
||||
mov %rcx,$len
|
||||
.Lcnowrap:
|
||||
mov %r10b,($dat,$idx)
|
||||
mov %r11b,($dat,$ido)
|
||||
add \$1,$ido#b
|
||||
jnc .Lc2ndloop
|
||||
movl \$-1,256($dat)
|
||||
|
||||
.align 16
|
||||
.Lexit_key:
|
||||
xor %eax,%eax
|
||||
mov %eax,-8($dat)
|
||||
mov %eax,-4($dat)
|
||||
ret
|
||||
.size RC4_set_key,.-RC4_set_key
|
||||
|
||||
.globl RC4_options
|
||||
.type RC4_options,\@function,0
|
||||
.align 16
|
||||
RC4_options:
|
||||
.picmeup %rax
|
||||
lea .Lopts-.(%rax),%rax
|
||||
mov OPENSSL_ia32cap_P(%rip),%edx
|
||||
bt \$20,%edx
|
||||
jnc .Ldone
|
||||
add \$12,%rax
|
||||
bt \$30,%edx
|
||||
jnc .Ldone
|
||||
add \$13,%rax
|
||||
.Ldone:
|
||||
ret
|
||||
.align 64
|
||||
.Lopts:
|
||||
.asciz "rc4(8x,int)"
|
||||
.asciz "rc4(8x,char)"
|
||||
.asciz "rc4(1x,char)"
|
||||
.asciz "RC4 for x86_64, OpenSSL project"
|
||||
.align 64
|
||||
.size RC4_options,.-RC4_options
|
||||
___
|
||||
|
||||
$code =~ s/#([bwd])/$1/gm;
|
||||
|
||||
print $code;
|
||||
|
@ -48,8 +48,37 @@ OPENSSL_wipe_cpu ENDP
|
||||
|
||||
OPENSSL_ia32_cpuid PROC
|
||||
mov r8,rbx
|
||||
|
||||
xor eax,eax
|
||||
cpuid
|
||||
xor eax,eax
|
||||
cmp ebx,0756e6547h
|
||||
setne al
|
||||
mov r9d,eax
|
||||
cmp edx,049656e69h
|
||||
setne al
|
||||
or r9d,eax
|
||||
cmp ecx,06c65746eh
|
||||
setne al
|
||||
or r9d,eax
|
||||
|
||||
mov eax,1
|
||||
cpuid
|
||||
bt edx,28
|
||||
jnc \$Ldone
|
||||
cmp r9,0
|
||||
jne \$Lnotintel
|
||||
or edx,000100000h
|
||||
and ah,15
|
||||
cmp ah,15
|
||||
je \$Lnotintel
|
||||
or edx,040000000h
|
||||
\$Lnotintel:
|
||||
shr ebx,16
|
||||
cmp bl,1
|
||||
ja \$Ldone
|
||||
and edx,0efffffffh
|
||||
\$Ldone:
|
||||
shl rcx,32
|
||||
mov eax,edx
|
||||
mov rbx,r8
|
||||
@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
|
||||
.align 16
|
||||
OPENSSL_ia32_cpuid:
|
||||
movq %rbx,%r8
|
||||
|
||||
xor %eax,%eax
|
||||
cpuid
|
||||
xor %eax,%eax
|
||||
cmp \$0x756e6547,%ebx # "Genu"
|
||||
setne %al
|
||||
mov %eax,%r9d
|
||||
cmp \$0x49656e69,%edx # "ineI"
|
||||
setne %al
|
||||
or %eax,%r9d
|
||||
cmp \$0x6c65746e,%ecx # "ntel"
|
||||
setne %al
|
||||
or %eax,%r9d
|
||||
|
||||
movl \$1,%eax
|
||||
cpuid
|
||||
bt \$28,%edx # test hyper-threading bit
|
||||
jnc .Ldone
|
||||
cmp \$0,%r9
|
||||
jne .Lnotintel
|
||||
or \$1<<20,%edx # use reserved bit to engage RC4_CHAR
|
||||
and \$15,%ah
|
||||
cmp \$15,%ah # examine Family ID
|
||||
je .Lnotintel
|
||||
or \$1<<30,%edx # use reserved bit to skip unrolled loop
|
||||
.Lnotintel:
|
||||
shr \$16,%ebx
|
||||
cmp \$1,%bl # see if cache is shared
|
||||
ja .Ldone
|
||||
and \$~(1<<28),%edx
|
||||
.Ldone:
|
||||
shlq \$32,%rcx
|
||||
movl %edx,%eax
|
||||
movq %r8,%rbx
|
||||
|
Loading…
Reference in New Issue
Block a user