aesni-x86_64.pl: optimize CBC decrypt.

Give CBC decrypt approximately the same treatment as CTR, for a gain of about 25%.
This commit is contained in:
Andy Polyakov 2013-04-04 15:56:23 +02:00
parent a42abde699
commit 73325b221c

View File

@ -129,7 +129,7 @@
#
# Further data for other parallelizable modes:
#
# CBC decrypt 1.16 0.93 0.93
# CBC decrypt 1.16 0.93 0.74
# CTR 1.14 0.91 0.77
#
# Well, given 3x column it's probably inappropriate to call the limit
@ -159,7 +159,7 @@
######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.
@ -1302,18 +1302,18 @@ $code.=<<___;
call .Lenc_loop8_enter
xorps $in0,$inout0
movups 0x30($inp),$in3
xorps $in1,$inout1
movups 0x40($inp),$in0
xorps $in2,$inout2
movups $inout0,($out)
xorps $in3,$inout3
movups $inout1,0x10($out)
xorps $in0,$inout4
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movdqu 0x30($inp),$in3
pxor $in0,$inout0
movdqu 0x40($inp),$in0
pxor $in1,$inout1
movdqu $inout0,($out)
pxor $in2,$inout2
movdqu $inout1,0x10($out)
pxor $in3,$inout3
movdqu $inout2,0x20($out)
pxor $in0,$inout4
movdqu $inout3,0x30($out)
movdqu $inout4,0x40($out)
cmp \$6,$len
jb .Lctr32_done
@ -1350,10 +1350,10 @@ $code.=<<___;
movups $inout0,($out)
xorps $in1,$inout1
movups $inout1,0x10($out)
xorps $in2,$inout2
movups $inout2,0x20($out)
xorps $in3,$inout3
movups $inout3,0x30($out)
pxor $in2,$inout2
movdqu $inout2,0x20($out)
pxor $in3,$inout3
movdqu $inout3,0x30($out)
jmp .Lctr32_done
.align 32
@ -2264,7 +2264,10 @@ ___
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
my $frame_size = 0x10 + ($win64?0x40:0); # used in decrypt
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
my $inp_=$key_;
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
@ -2330,248 +2333,301 @@ $code.=<<___ if ($win64);
movaps %xmm7,0x20(%rsp)
movaps %xmm8,0x30(%rsp)
movaps %xmm9,0x40(%rsp)
movaps %xmm10,0x50(%rsp)
movaps %xmm11,0x60(%rsp)
movaps %xmm12,0x70(%rsp)
movaps %xmm13,0x80(%rsp)
movaps %xmm14,0x90(%rsp)
movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
lea -8(%rax),%rbp
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x70,$len
cmp \$0x50,$len
jbe .Lcbc_dec_tail
shr \$1,$rnds_
$movkey ($key),$rndkey0
movdqu 0x00($inp),$inout0 # load input
movdqu 0x10($inp),$inout1
movdqa $inout0,$in0
movdqu 0x20($inp),$inout2
movdqa $inout1,$in1
movdqu 0x30($inp),$inout3
movdqa $inout2,$in2
movdqu 0x40($inp),$inout4
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
cmp \$0x70,$len
jbe .Lcbc_dec_six_or_seven
sub \$0x70,$len
mov $rnds_,$rounds
movaps $iv,(%rsp)
lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
movaps $rndkey0,(%rsp) # save IV
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
$movkey ($key),$rndkey0
movups ($inp),$inout0 # load input
movups 0x10($inp),$inout1
$movkey 16($key),$rndkey1
lea 32($key),$key
movdqu 0x20($inp),$inout2
xorps $rndkey0,$inout0
movdqu 0x30($inp),$inout3
xorps $rndkey0,$inout1
movdqu 0x40($inp),$inout4
aesdec $rndkey1,$inout0
movdqu 0x60($inp),$inout6
pxor $rndkey0,$inout0
movdqu 0x70($inp),$inout7
pxor $rndkey0,$inout1
$movkey 0x10-0x70($key),$rndkey1
pxor $rndkey0,$inout2
movdqu 0x50($inp),$inout5
aesdec $rndkey1,$inout1
xor $inp_,$inp_
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
pxor $rndkey0,$inout3
movdqu 0x60($inp),$inout6
aesdec $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
pxor $rndkey0,$inout5
dec $rounds
aesdec $rndkey1,$inout4
pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
aesdec $rndkey1,$inout0
pxor $rndkey0,$inout7
$movkey ($key),$rndkey0
$movkey 0x20-0x70($key),$rndkey0
aesdec $rndkey1,$inout1
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
setnc ${inp_}b
aesdec $rndkey1,$inout6
shl \$7,$inp_
aesdec $rndkey1,$inout7
add $inp,$inp_
$movkey 0x30-0x70($key),$rndkey1
___
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
aesdec $rndkeyx,$inout0
aesdec $rndkeyx,$inout1
aesdec $rndkeyx,$inout2
aesdec $rndkeyx,$inout3
aesdec $rndkeyx,$inout4
aesdec $rndkeyx,$inout5
aesdec $rndkeyx,$inout6
aesdec $rndkeyx,$inout7
$movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
___
$code.=<<___ if ($i==7);
cmp \$11,$rounds
jb .Lcbc_dec_done
___
$code.=<<___ if ($i==9);
je .Lcbc_dec_done
___
}
$code.=<<___;
.Lcbc_dec_done:
aesdec $rndkey1,$inout0
pxor $rndkey0,$iv
aesdec $rndkey1,$inout1
pxor $rndkey0,$in0
aesdec $rndkey1,$inout2
pxor $rndkey0,$in1
aesdec $rndkey1,$inout3
pxor $rndkey0,$in2
aesdec $rndkey1,$inout4
pxor $rndkey0,$in3
aesdec $rndkey1,$inout5
pxor $rndkey0,$in4
aesdec $rndkey1,$inout6
aesdec $rndkey1,$inout7
$movkey 16($key),$rndkey1
movdqu 0x50($inp),$rndkey1
call .Ldec_loop8_enter
aesdeclast $iv,$inout0
movdqu 0x60($inp),$iv # borrow $iv
pxor $rndkey0,$rndkey1
aesdeclast $in0,$inout1
pxor $rndkey0,$iv
movdqu 0x70($inp),$rndkey0 # next IV
lea 0x80($inp),$inp
aesdeclast $in1,$inout2
movdqu 0x00($inp_),$in0
aesdeclast $in2,$inout3
movdqu 0x10($inp_),$in1
aesdeclast $in3,$inout4
movdqu 0x20($inp_),$in2
aesdeclast $in4,$inout5
movdqu 0x30($inp_),$in3
aesdeclast $rndkey1,$inout6
movdqu 0x40($inp_),$in4
aesdeclast $iv,$inout7
movdqa $rndkey0,$iv # return $iv
movdqu 0x50($inp_),$rndkey1
$movkey -0x70($key),$rndkey0
movups $inout0,($out) # store output
movdqa $in0,$inout0
movups $inout1,0x10($out)
movdqa $in1,$inout1
movups $inout2,0x20($out)
movdqa $in2,$inout2
movups $inout3,0x30($out)
movdqa $in3,$inout3
movups $inout4,0x40($out)
movdqa $in4,$inout4
movups $inout5,0x50($out)
movdqa $rndkey1,$inout5
movups $inout6,0x60($out)
lea 0x70($out),$out
movups ($inp),$rndkey1 # re-load input
movups 0x10($inp),$rndkey0
xorps (%rsp),$inout0 # ^= IV
xorps $rndkey1,$inout1
movups 0x20($inp),$rndkey1
xorps $rndkey0,$inout2
movups 0x30($inp),$rndkey0
xorps $rndkey1,$inout3
movups 0x40($inp),$rndkey1
xorps $rndkey0,$inout4
movups 0x50($inp),$rndkey0
xorps $rndkey1,$inout5
movups 0x60($inp),$rndkey1
xorps $rndkey0,$inout6
movups 0x70($inp),$rndkey0 # IV
xorps $rndkey1,$inout7
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,0x40($out)
mov $key_,$key # restore $key
movups $inout5,0x50($out)
lea 0x80($inp),$inp
movups $inout6,0x60($out)
lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
movaps $rndkey0,$iv
lea -0x70($key),$key
add \$0x70,$len
jle .Lcbc_dec_tail_collected
movups $inout0,($out)
lea 1($rnds_,$rnds_),$rounds
movups $inout7,($out)
lea 0x10($out),$out
cmp \$0x50,$len
jbe .Lcbc_dec_tail
movaps $in0,$inout0
.Lcbc_dec_six_or_seven:
cmp \$0x60,$len
ja .Lcbc_dec_seven
movaps $inout5,$inout6
call _aesni_decrypt6
pxor $iv,$inout0 # ^= IV
movaps $inout6,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
pxor $in4,$inout5
movdqu $inout4,0x40($out)
lea 0x50($out),$out
movdqa $inout5,$inout0
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_seven:
movups 0x60($inp),$inout6
xorps $inout7,$inout7
call _aesni_decrypt8
movups 0x50($inp),$inout7
pxor $iv,$inout0 # ^= IV
movups 0x60($inp),$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
pxor $in4,$inout5
movdqu $inout4,0x40($out)
pxor $inout7,$inout6
movdqu $inout5,0x50($out)
lea 0x60($out),$out
movdqa $inout6,$inout0
jmp .Lcbc_dec_tail_collected
.Lcbc_dec_tail:
movups ($inp),$inout0
movaps $inout0,$in0
cmp \$0x10,$len
sub \$0x10,$len
jbe .Lcbc_dec_one
movups 0x10($inp),$inout1
movaps $inout1,$in1
cmp \$0x20,$len
movaps $inout0,$in0
sub \$0x10,$len
jbe .Lcbc_dec_two
movups 0x20($inp),$inout2
movaps $inout2,$in2
cmp \$0x30,$len
movaps $inout1,$in1
sub \$0x10,$len
jbe .Lcbc_dec_three
movups 0x30($inp),$inout3
cmp \$0x40,$len
movaps $inout2,$in2
sub \$0x10,$len
jbe .Lcbc_dec_four
movups 0x40($inp),$inout4
cmp \$0x50,$len
jbe .Lcbc_dec_five
movups 0x50($inp),$inout5
cmp \$0x60,$len
jbe .Lcbc_dec_six
movups 0x60($inp),$inout6
movaps $iv,(%rsp) # save IV
call _aesni_decrypt8
movups ($inp),$rndkey1
movups 0x10($inp),$rndkey0
xorps (%rsp),$inout0 # ^= IV
xorps $rndkey1,$inout1
movups 0x20($inp),$rndkey1
xorps $rndkey0,$inout2
movups 0x30($inp),$rndkey0
xorps $rndkey1,$inout3
movups 0x40($inp),$rndkey1
xorps $rndkey0,$inout4
movups 0x50($inp),$rndkey0
xorps $rndkey1,$inout5
movups 0x60($inp),$iv # IV
xorps $rndkey0,$inout6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
lea 0x60($out),$out
movaps $inout6,$inout0
sub \$0x70,$len
movaps $inout3,$in3
movaps $inout4,$in4
xorps $inout5,$inout5
call _aesni_decrypt6
pxor $iv,$inout0
movaps $in4,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
lea 0x40($out),$out
movdqa $inout4,$inout0
sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_one:
movaps $inout0,$in0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
movaps $inout1,$in1
xorps $inout2,$inout2
call _aesni_decrypt3
xorps $iv,$inout0
xorps $in0,$inout1
movups $inout0,($out)
pxor $iv,$inout0
movaps $in1,$iv
movaps $inout1,$inout0
pxor $in0,$inout1
movdqu $inout0,($out)
movdqa $inout1,$inout0
lea 0x10($out),$out
sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
movaps $inout2,$in2
call _aesni_decrypt3
xorps $iv,$inout0
xorps $in0,$inout1
movups $inout0,($out)
xorps $in1,$inout2
movups $inout1,0x10($out)
pxor $iv,$inout0
movaps $in2,$iv
movaps $inout2,$inout0
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
movdqa $inout2,$inout0
lea 0x20($out),$out
sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
movaps $inout3,$in3
call _aesni_decrypt4
xorps $iv,$inout0
movups 0x30($inp),$iv
xorps $in0,$inout1
movups $inout0,($out)
xorps $in1,$inout2
movups $inout1,0x10($out)
xorps $in2,$inout3
movups $inout2,0x20($out)
movaps $inout3,$inout0
pxor $iv,$inout0
movaps $in3,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
movdqa $inout3,$inout0
lea 0x30($out),$out
sub \$0x40,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
movups 0x10($inp),$rndkey1
movups 0x20($inp),$rndkey0
xorps $iv,$inout0
xorps $in0,$inout1
xorps $rndkey1,$inout2
movups 0x30($inp),$rndkey1
xorps $rndkey0,$inout3
movups 0x40($inp),$iv
xorps $rndkey1,$inout4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
lea 0x40($out),$out
movaps $inout4,$inout0
sub \$0x50,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_six:
call _aesni_decrypt6
movups 0x10($inp),$rndkey1
movups 0x20($inp),$rndkey0
xorps $iv,$inout0
xorps $in0,$inout1
xorps $rndkey1,$inout2
movups 0x30($inp),$rndkey1
xorps $rndkey0,$inout3
movups 0x40($inp),$rndkey0
xorps $rndkey1,$inout4
movups 0x50($inp),$iv
xorps $rndkey0,$inout5
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
lea 0x50($out),$out
movaps $inout5,$inout0
sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_tail_collected:
and \$15,$len
movups $iv,($ivp)
and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
jmp .Lcbc_dec_ret
@ -2591,6 +2647,12 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm7
movaps 0x30(%rsp),%xmm8
movaps 0x40(%rsp),%xmm9
movaps 0x50(%rsp),%xmm10
movaps 0x60(%rsp),%xmm11
movaps 0x70(%rsp),%xmm12
movaps 0x80(%rsp),%xmm13
movaps 0x90(%rsp),%xmm14
movaps 0xa0(%rsp),%xmm15
___
$code.=<<___;
lea (%rbp),%rsp
@ -3013,7 +3075,7 @@ cbc_se_handler:
lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
.Lcommon_rbp_tail: