~15% better AES x86_64 assembler.

Andy Polyakov 2005-07-18 09:15:04 +00:00
parent f42e6d24f2
commit afbe674edb
2 changed files with 526 additions and 180 deletions

crypto/aes/asm/aes-x86_64.pl

@ -6,9 +6,9 @@
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 1.0.
# Version 1.1.
#
# aes-*-cbc benchmarks are improved by 50% [compared to gcc 3.3.2 on
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
@ -18,14 +18,14 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
# ECB CBC encrypt
# AMD64 15.6 14.6(*)
# EM64T 23.3(**) 21.4(*)
# AMD64 13.7 13.0(*)
# EM64T 20.2 18.6(*)
#
# (*) CBC benchmarks are better than ECB thanks to custom ABI used
# by the private block encryption function.
# (**) This module exhibits virtually same ECB performance as 32-bit
# counterpart on [current] Intel CPU.
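Taking the Opteron 240's nominal 1.4 GHz clock as a reference (an assumed figure; the table above only gives cycle counts), the CBC-encrypt improvement from 14.6 to 13.0 cycles per byte works out to roughly 96 -> 108 MB/s. A one-liner for the conversion:

    #!/usr/bin/env perl
    # cycles/byte -> MB/s at an assumed clock; cycle counts from the table above.
    my $clock_mhz = 1400;          # Opteron 240 nominal clock (assumed)
    printf "%4.1f cycles/byte  ->  %5.1f MB/s\n", $_, $clock_mhz/$_
            for (15.6, 14.6, 13.7, 13.0);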
$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
@ -35,20 +35,22 @@ $s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$inp="%rdi";
$out="%rsi";
$acc0="%ebp";
$acc1="%r8d";
$acc2="%r9d";
$acc0="%esi";
$acc1="%edi";
$acc2="%ebp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$cnt="%r13d";
$tbl="%r14";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";
sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r; }
sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
$r =~ s/%[er]([sd]i)/%\1l/;
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
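The widened lo() above now also maps %esi/%edi to their byte forms %sil/%dil and the numbered registers %r8d-%r15d to %r8b-%r15b, matching the larger register set the rewrite works with; hi() is unchanged since only %eax-%edx have high-byte forms. The same substitutions, wrapped so they can be run stand-alone:

    #!/usr/bin/env perl
    # The hi()/lo() register-name mapping above, runnable on its own.
    sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;  $r; }
    sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
             $r =~ s/%[er]([sd]i)/%${1}l/;
             $r =~ s/%(r[0-9]+)[d]?/%${1}b/;              $r; }

    printf "%-5s -> lo %-5s hi %s\n", $_, lo($_), hi($_) for qw(%eax %esi %r8d);
    # %eax  -> lo %al    hi %ah
    # %esi  -> lo %sil   hi %esi    (hi() only rewrites the a-d registers)
    # %r8d  -> lo %r8b   hi %r8d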
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@ -61,6 +63,169 @@ sub data_word()
$code.=sprintf"0x%08x\n",$last;
}
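_data_word() above emits every 32-bit table entry twice (".long v,v"), which is what makes the 8-byte stride and the byte offsets 0-3 in the lookups work: with the tables kept in little-endian byte order, reading a dword at byte offset k inside the duplicated 8-byte entry yields the value rotated right by 8*k bits, so one physical table stands in for all four rotated T-tables. A quick self-contained check (the table value here is hypothetical; assumes a perl with 64-bit integers):

    #!/usr/bin/env perl
    # Little-endian demo: a ".long v,v" entry read at byte offset k == ror32(v, 8*k).
    my $v     = 0xc66363a5;                 # hypothetical 32-bit table entry
    my $entry = pack("V V", $v, $v);        # memory image of ".long v,v"

    for my $k (0 .. 3) {
        my $read = unpack("V", substr($entry, $k, 4));             # dword at offset k
        my $ror  = (($v >> (8*$k)) | ($v << (32 - 8*$k))) & 0xffffffff;
        printf "offset %d: read %08x  ror%-2d %08x\n", $k, $read, 8*$k, $ror;
    }
    # All four pairs agree, which is why the code above selects the rotation it
    # needs simply by picking the byte offset (0, 1, 2 or 3) into the entry.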
sub encvert()
{ my $t3="%r8d"; # zaps $inp!
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
# favor 3-way issue Opteron pipeline...
movzb `&lo($qs0)`,$acc0
movzb `&lo($qs1)`,$acc1
movzb `&lo($qs2)`,$acc2
mov 0($sbox,$acc0,8),$t0
mov 0($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t2
movzb `&hi($qs1)`,$acc0
movzb `&hi($qs2)`,$acc1
movzb `&lo($qs3)`,$acc2
xor 3($sbox,$acc0,8),$t0
xor 3($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t3
movzb `&hi($qs3)`,$acc0
shr \$16,$s2
movzb `&hi($qs0)`,$acc2
xor 3($sbox,$acc0,8),$t2
shr \$16,$s3
xor 3($sbox,$acc2,8),$t3
shr \$16,$s1
lea 16($key),$key
shr \$16,$s0
movzb `&lo($qs2)`,$acc0
movzb `&lo($qs3)`,$acc1
movzb `&lo($qs0)`,$acc2
xor 2($sbox,$acc0,8),$t0
xor 2($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t2
movzb `&hi($qs3)`,$acc0
movzb `&hi($qs0)`,$acc1
movzb `&lo($qs1)`,$acc2
xor 1($sbox,$acc0,8),$t0
xor 1($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t3
mov 12($key),$s3
movzb `&hi($qs1)`,$acc1
movzb `&hi($qs2)`,$acc2
mov 0($key),$s0
xor 1($sbox,$acc1,8),$t2
xor 1($sbox,$acc2,8),$t3
mov 4($key),$s1
mov 8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
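Scheduling aside, encvert() performs the same gather as four encstep() calls: read off the listing, each output word j combines byte 0 of s_j at table offset 0, byte 1 of s_(j+1) at offset 3, byte 2 of s_(j+2) at offset 2 and byte 3 of s_(j+3) at offset 1, then XORs in round-key word j. A plain-Perl restatement of that data flow (my own sketch; the lookup closure is a stand-in, not the real Te table):

    # $T->($byte, $k) stands for "dword at byte offset $k of the 8-byte entry
    # for $byte", i.e. the k-th rotation per the note above.
    sub enc_round {
        my ($T, $rk, @s) = @_;
        my @t;
        for my $j (0 .. 3) {
            $t[$j] = $T->(  $s[ $j      % 4]        & 0xff, 0)
                   ^ $T->( ($s[($j + 1) % 4] >>  8) & 0xff, 3)
                   ^ $T->( ($s[($j + 2) % 4] >> 16) & 0xff, 2)
                   ^ $T->( ($s[($j + 3) % 4] >> 24) & 0xff, 1)
                   ^ $rk->[$j];
        }
        return @t;
    }

    # Dummy invocation, just to show the shape (the closure is not Te):
    my $T  = sub { my ($byte, $k) = @_; ($byte * 0x01010101) ^ $k };
    my @rk = (0) x 4;
    printf "%08x %08x %08x %08x\n",
           enc_round($T, \@rk, 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);

The interleaving in encvert() exists purely to keep three independent dependency chains in flight for the 3-way issue Opteron pipeline; the arithmetic is unchanged.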
sub enclastvert()
{ my $t3="%r8d"; # zaps $inp!
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
movzb `&lo($qs0)`,$acc0
movzb `&lo($qs1)`,$acc1
movzb `&lo($qs2)`,$acc2
mov 2($sbox,$acc0,8),$t0
mov 2($sbox,$acc1,8),$t1
mov 2($sbox,$acc2,8),$t2
and \$0x000000ff,$t0
and \$0x000000ff,$t1
and \$0x000000ff,$t2
movzb `&lo($qs3)`,$acc0
movzb `&hi($qs1)`,$acc1
movzb `&hi($qs2)`,$acc2
mov 2($sbox,$acc0,8),$t3
mov 0($sbox,$acc1,8),$acc1 #$t0
mov 0($sbox,$acc2,8),$acc2 #$t1
and \$0x000000ff,$t3
and \$0x0000ff00,$acc1
and \$0x0000ff00,$acc2
xor $acc1,$t0
xor $acc2,$t1
shr \$16,$s2
movzb `&hi($qs3)`,$acc0
movzb `&hi($qs0)`,$acc1
shr \$16,$s3
mov 0($sbox,$acc0,8),$acc0 #$t2
mov 0($sbox,$acc1,8),$acc1 #$t3
and \$0x0000ff00,$acc0
and \$0x0000ff00,$acc1
shr \$16,$s1
xor $acc0,$t2
xor $acc1,$t3
shr \$16,$s0
movzb `&lo($qs2)`,$acc0
movzb `&lo($qs3)`,$acc1
movzb `&lo($qs0)`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t0
mov 0($sbox,$acc1,8),$acc1 #$t1
mov 0($sbox,$acc2,8),$acc2 #$t2
and \$0x00ff0000,$acc0
and \$0x00ff0000,$acc1
and \$0x00ff0000,$acc2
xor $acc0,$t0
xor $acc1,$t1
xor $acc2,$t2
movzb `&lo($qs1)`,$acc0
movzb `&hi($qs3)`,$acc1
movzb `&hi($qs0)`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t3
mov 2($sbox,$acc1,8),$acc1 #$t0
mov 2($sbox,$acc2,8),$acc2 #$t1
and \$0x00ff0000,$acc0
and \$0xff000000,$acc1
and \$0xff000000,$acc2
xor $acc0,$t3
xor $acc1,$t0
xor $acc2,$t1
movzb `&hi($qs1)`,$acc0
movzb `&hi($qs2)`,$acc1
mov 16+12($key),$s3
mov 2($sbox,$acc0,8),$acc0 #$t2
mov 2($sbox,$acc1,8),$acc1 #$t3
mov 16+0($key),$s0
and \$0xff000000,$acc0
and \$0xff000000,$acc1
xor $acc0,$t2
xor $acc1,$t3
mov 16+4($key),$s1
mov 16+8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
sub encstep()
{ my ($i,@s) = @_;
my $tmp0=$acc0;
@ -68,24 +233,28 @@ sub encstep()
my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i];
$code.=" mov $s[0],$out\n" if ($i!=3);
$tmp1=$s[2] if ($i==3);
if ($i==3) {
$tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
$code.=" lea 16($key),$key\n" if ($i==0);
$code.=" mov 0($tbl,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$tmp0=$s[1] if ($i==3);
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" mov 0($sbox,$out,8),$out\n";
$code.=" xor 3($tbl,$tmp0,8),$out\n";
$code.=" xor 2($tbl,$tmp1,8),$out\n";
$code.=" xor 1($tbl,$tmp2,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" xor 4*$i($key),$out\n";
$code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t0,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3);
@ -100,25 +269,26 @@ sub enclast()
my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i];
$code.=" mov $s[0],$out\n" if ($i!=3);
$tmp1=$s[2] if ($i==3);
if ($i==3) {
$tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
$code.=" mov 2($tbl,$out,8),$out\n";
$code.=" mov 2($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" and \$0x000000ff,$out\n";
$tmp0=$s[1] if ($i==3);
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" and \$0xFF,$tmp1\n";
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" mov 0($tbl,$tmp0,8),$tmp0\n";
$code.=" mov 0($tbl,$tmp1,8),$tmp1\n";
$code.=" mov 2($tbl,$tmp2,8),$tmp2\n";
$code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
$code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
$code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
$code.=" and \$0x0000ff00,$tmp0\n";
$code.=" and \$0x00ff0000,$tmp1\n";
@ -142,36 +312,35 @@ _x86_64_AES_encrypt:
xor 8($key),$s2
xor 12($key),$s3
mov 240($key),$cnt # load key->rounds
sub \$1,$cnt
.align 4
mov 240($key),$rnds # load key->rounds
sub \$1,$rnds
jmp .Lenc_loop
.align 16
.Lenc_loop:
___
&encstep(0,$s0,$s1,$s2,$s3);
&encstep(1,$s1,$s2,$s3,$s0);
&encstep(2,$s2,$s3,$s0,$s1);
&encstep(3,$s3,$s0,$s1,$s2);
if ($verticalspin) { &encvert(); }
else { &encstep(0,$s0,$s1,$s2,$s3);
&encstep(1,$s1,$s2,$s3,$s0);
&encstep(2,$s2,$s3,$s0,$s1);
&encstep(3,$s3,$s0,$s1,$s2);
}
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
sub \$1,$cnt
sub \$1,$rnds
jnz .Lenc_loop
___
&enclast(0,$s0,$s1,$s2,$s3);
&enclast(1,$s1,$s2,$s3,$s0);
&enclast(2,$s2,$s3,$s0,$s1);
&enclast(3,$s3,$s0,$s1,$s2);
if ($verticalspin) { &enclastvert(); }
else { &enclast(0,$s0,$s1,$s2,$s3);
&enclast(1,$s1,$s2,$s3,$s0);
&enclast(2,$s2,$s3,$s0,$s1);
&enclast(3,$s3,$s0,$s1,$s2);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 16+4($key),$s1
xor 16+8($key),$s2
xor 16+12($key),$s3
___
}
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
.byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
@ -190,9 +359,11 @@ AES_encrypt:
push %r15
mov %rdx,$key
mov %rdi,$inp
mov %rsi,$out
.picmeup $tbl
lea AES_Te-.($tbl),$tbl
.picmeup $sbox
lea AES_Te-.($sbox),$sbox
mov 0($inp),$s0
mov 4($inp),$s1
@ -218,6 +389,169 @@ ___
#------------------------------------------------------------------#
sub decvert()
{ my $t3="%r8d"; # zaps $inp!
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
# favor 3-way issue Opteron pipeline...
movzb `&lo($qs0)`,$acc0
movzb `&lo($qs1)`,$acc1
movzb `&lo($qs2)`,$acc2
mov 0($sbox,$acc0,8),$t0
mov 0($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t2
movzb `&hi($qs3)`,$acc0
movzb `&hi($qs0)`,$acc1
movzb `&lo($qs3)`,$acc2
xor 3($sbox,$acc0,8),$t0
xor 3($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t3
movzb `&hi($qs1)`,$acc0
shr \$16,$s0
movzb `&hi($qs2)`,$acc2
xor 3($sbox,$acc0,8),$t2
shr \$16,$s3
xor 3($sbox,$acc2,8),$t3
shr \$16,$s1
lea 16($key),$key
shr \$16,$s2
movzb `&lo($qs2)`,$acc0
movzb `&lo($qs3)`,$acc1
movzb `&lo($qs0)`,$acc2
xor 2($sbox,$acc0,8),$t0
xor 2($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t2
movzb `&hi($qs1)`,$acc0
movzb `&hi($qs2)`,$acc1
movzb `&lo($qs1)`,$acc2
xor 1($sbox,$acc0,8),$t0
xor 1($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t3
movzb `&hi($qs3)`,$acc0
mov 12($key),$s3
movzb `&hi($qs0)`,$acc2
xor 1($sbox,$acc0,8),$t2
mov 0($key),$s0
xor 1($sbox,$acc2,8),$t3
xor $t0,$s0
mov 4($key),$s1
mov 8($key),$s2
xor $t2,$s2
xor $t1,$s1
xor $t3,$s3
___
}
sub declastvert()
{ my $t3="%r8d"; # zaps $inp!
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
movzb `&lo($qs0)`,$acc0
movzb `&lo($qs1)`,$acc1
movzb `&lo($qs2)`,$acc2
mov 2048($sbox,$acc0,4),$t0
mov 2048($sbox,$acc1,4),$t1
mov 2048($sbox,$acc2,4),$t2
and \$0x000000ff,$t0
and \$0x000000ff,$t1
and \$0x000000ff,$t2
movzb `&lo($qs3)`,$acc0
movzb `&hi($qs3)`,$acc1
movzb `&hi($qs0)`,$acc2
mov 2048($sbox,$acc0,4),$t3
mov 2048($sbox,$acc1,4),$acc1 #$t0
mov 2048($sbox,$acc2,4),$acc2 #$t1
and \$0x000000ff,$t3
and \$0x0000ff00,$acc1
and \$0x0000ff00,$acc2
xor $acc1,$t0
xor $acc2,$t1
shr \$16,$s3
movzb `&hi($qs1)`,$acc0
movzb `&hi($qs2)`,$acc1
shr \$16,$s0
mov 2048($sbox,$acc0,4),$acc0 #$t2
mov 2048($sbox,$acc1,4),$acc1 #$t3
and \$0x0000ff00,$acc0
and \$0x0000ff00,$acc1
shr \$16,$s1
xor $acc0,$t2
xor $acc1,$t3
shr \$16,$s2
movzb `&lo($qs2)`,$acc0
movzb `&lo($qs3)`,$acc1
movzb `&lo($qs0)`,$acc2
mov 2048($sbox,$acc0,4),$acc0 #$t0
mov 2048($sbox,$acc1,4),$acc1 #$t1
mov 2048($sbox,$acc2,4),$acc2 #$t2
and \$0x00ff0000,$acc0
and \$0x00ff0000,$acc1
and \$0x00ff0000,$acc2
xor $acc0,$t0
xor $acc1,$t1
xor $acc2,$t2
movzb `&lo($qs1)`,$acc0
movzb `&hi($qs1)`,$acc1
movzb `&hi($qs2)`,$acc2
mov 2048($sbox,$acc0,4),$acc0 #$t3
mov 2048($sbox,$acc1,4),$acc1 #$t0
mov 2048($sbox,$acc2,4),$acc2 #$t1
and \$0x00ff0000,$acc0
and \$0xff000000,$acc1
and \$0xff000000,$acc2
xor $acc0,$t3
xor $acc1,$t0
xor $acc2,$t1
movzb `&hi($qs3)`,$acc0
movzb `&hi($qs0)`,$acc1
mov 16+12($key),$s3
mov 2048($sbox,$acc0,4),$acc0 #$t2
mov 2048($sbox,$acc1,4),$acc1 #$t3
mov 16+0($key),$s0
and \$0xff000000,$acc0
and \$0xff000000,$acc1
xor $acc0,$t2
xor $acc1,$t3
mov 16+4($key),$s1
mov 16+8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
sub decstep()
{ my ($i,@s) = @_;
my $tmp0=$acc0;
@ -230,7 +564,7 @@ sub decstep()
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
$code.=" mov 0($tbl,$out,8),$out\n";
$code.=" mov 0($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
@ -240,9 +574,9 @@ sub decstep()
$code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" xor 3($tbl,$tmp0,8),$out\n";
$code.=" xor 2($tbl,$tmp1,8),$out\n";
$code.=" xor 1($tbl,$tmp2,8),$out\n";
$code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t2,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3);
@ -262,7 +596,7 @@ sub declast()
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
$code.=" mov 2048($tbl,$out,4),$out\n";
$code.=" mov 2048($sbox,$out,4),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
@ -273,9 +607,9 @@ sub declast()
$code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" mov 2048($tbl,$tmp0,4),$tmp0\n";
$code.=" mov 2048($tbl,$tmp1,4),$tmp1\n";
$code.=" mov 2048($tbl,$tmp2,4),$tmp2\n";
$code.=" mov 2048($sbox,$tmp0,4),$tmp0\n";
$code.=" mov 2048($sbox,$tmp1,4),$tmp1\n";
$code.=" mov 2048($sbox,$tmp2,4),$tmp2\n";
$code.=" and \$0x0000ff00,$tmp0\n";
$code.=" and \$0x00ff0000,$tmp1\n";
@ -299,36 +633,42 @@ _x86_64_AES_decrypt:
xor 8($key),$s2
xor 12($key),$s3
mov 240($key),$cnt # load key->rounds
sub \$1,$cnt
.align 4
mov 240($key),$rnds # load key->rounds
sub \$1,$rnds
jmp .Ldec_loop
.align 16
.Ldec_loop:
___
&decstep(0,$s0,$s3,$s2,$s1);
&decstep(1,$s1,$s0,$s3,$s2);
&decstep(2,$s2,$s1,$s0,$s3);
&decstep(3,$s3,$s2,$s1,$s0);
if ($verticalspin) { &decvert(); }
else { &decstep(0,$s0,$s3,$s2,$s1);
&decstep(1,$s1,$s0,$s3,$s2);
&decstep(2,$s2,$s1,$s0,$s3);
&decstep(3,$s3,$s2,$s1,$s0);
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
___
}
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
sub \$1,$cnt
sub \$1,$rnds
jnz .Ldec_loop
___
&declast(0,$s0,$s3,$s2,$s1);
&declast(1,$s1,$s0,$s3,$s2);
&declast(2,$s2,$s1,$s0,$s3);
&declast(3,$s3,$s2,$s1,$s0);
if ($verticalspin) { &declastvert(); }
else { &declast(0,$s0,$s3,$s2,$s1);
&declast(1,$s1,$s0,$s3,$s2);
&declast(2,$s2,$s1,$s0,$s3);
&declast(3,$s3,$s2,$s1,$s0);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 16+4($key),$s1
xor 16+8($key),$s2
xor 16+12($key),$s3
___
}
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
.byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
@ -347,9 +687,11 @@ AES_decrypt:
push %r15
mov %rdx,$key
mov %rdi,$inp
mov %rsi,$out
.picmeup $tbl
lea AES_Td-.($tbl),$tbl
.picmeup $sbox
lea AES_Td-.($sbox),$sbox
mov 0($inp),$s0
mov 4($inp),$s1
@ -719,24 +1061,24 @@ AES_cbc_encrypt:
pushfq
cld
.picmeup $tbl
.picmeup $sbox
.Lcbc_pic_point:
cmp \$0,%r9
je .LDECRYPT
lea AES_Te-.Lcbc_pic_point($tbl),$tbl
lea AES_Te-.Lcbc_pic_point($sbox),$sbox
# allocate aligned stack frame...
lea -64-248(%rsp),$key
and \$-64,$key
# ... and make sure it doesn't alias with AES_Te modulo 4096
mov $tbl,%r10
lea 2048($tbl),%r11
mov $sbox,%r10
lea 2048($sbox),%r11
mov $key,%r12
and \$0xFFF,%r10 # s = $tbl&0xfff
and \$0xFFF,%r11 # e = ($tbl+2048)&0xfff
and \$0xFFF,%r10 # s = $sbox&0xfff
and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
and \$0xFFF,%r12 # p = %rsp&0xfff
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
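The comparison above is the first half of the check the comments describe: s, e and p are the table start, table end and stack frame pointer reduced modulo 4096, and the frame (which may receive a copy of the key schedule) is then slid down so it does not land on the same 4096-byte-aliased cache sets as the just-prefetched 2KB Te table, which would defeat the prefetch. A tiny checker for that overlap condition (the helper name, the 320-byte frame length and the example addresses are mine, for illustration only):

    # Does [frame, frame+len) overlap the table's image taken modulo 4096?
    sub aliases_mod_4k {
        my ($frame, $frame_len, $tbl, $tbl_len) = @_;
        my $s = $tbl & 0xfff;
        for my $off (0 .. $frame_len - 1) {
            my $p = ($frame + $off) & 0xfff;
            return 1 if (($p - $s) & 0xfff) < $tbl_len;   # inside [s, s+len) mod 4K
        }
        return 0;
    }
    printf "clear:   %d\n", aliases_mod_4k(0x7fff_ffff_e000, 320, 0x5555_5555_4800, 2048);
    printf "aliases: %d\n", aliases_mod_4k(0x7fff_ffff_e900, 320, 0x5555_5555_4800, 2048);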
@ -758,13 +1100,15 @@ AES_cbc_encrypt:
mov %rdx,$_len # save copy of len
mov %rcx,$_key # save copy of key
mov %r8,$_ivp # save copy of ivp
movl \$0,$mark # copy of aes_key->rounds = 0;
mov %r8,%rbp # rearrange input arguments
mov %rsi,$out
mov %rdi,$inp
mov %rcx,$key
# do we copy key schedule to stack?
mov $key,%r10
sub $tbl,%r10
sub $sbox,%r10
and \$0xfff,%r10
cmp \$2048,%r10
jb .Lcbc_do_ecopy
@ -772,8 +1116,6 @@ AES_cbc_encrypt:
jb .Lcbc_skip_ecopy
.align 4
.Lcbc_do_ecopy:
mov %rsi,%r10 # backup $inp,$out
mov %rdi,%r11
mov $key,%rsi
lea $aes_key,%rdi
lea $aes_key,$key
@ -781,29 +1123,27 @@ AES_cbc_encrypt:
.long 0x90A548F3 # rep movsq
mov (%rsi),%eax # copy aes_key->rounds
mov %eax,(%rdi)
mov %r10,%rsi # restore $inp,$out
mov %r11,%rdi
.Lcbc_skip_ecopy:
mov $key,$keyp # save key pointer
mov \$16,%ecx
.align 4
.Lcbc_prefetch_te:
mov 0($tbl),%r10
mov 32($tbl),%r11
mov 64($tbl),%r12
mov 96($tbl),%r13
lea 128($tbl),$tbl
mov 0($sbox),%r10
mov 32($sbox),%r11
mov 64($sbox),%r12
mov 96($sbox),%r13
lea 128($sbox),$sbox
sub \$1,%ecx
jnz .Lcbc_prefetch_te
sub \$2048,$tbl
sub \$2048,$sbox
test \$-16,%rdx # check upon length
mov %rdx,%r10
mov 0(%r8),$s0 # load iv
mov 4(%r8),$s1
mov 8(%r8),$s2
mov 12(%r8),$s3
mov 0(%rbp),$s0 # load iv
mov 4(%rbp),$s1
mov 8(%rbp),$s2
mov 12(%rbp),$s3
jz .Lcbc_enc_tail # short input...
.align 4
@ -812,10 +1152,12 @@ AES_cbc_encrypt:
xor 4($inp),$s1
xor 8($inp),$s2
xor 12($inp),$s3
mov $inp,$ivec # if ($verticalspin) save inp
mov $keyp,$key # restore key
call _x86_64_AES_encrypt
mov $ivec,$inp # if ($verticalspin) restore inp
mov $s0,0($out)
mov $s1,4($out)
mov $s2,8($out)
@ -830,11 +1172,11 @@ AES_cbc_encrypt:
jnz .Lcbc_enc_loop
test \$15,%r10
jnz .Lcbc_enc_tail
mov $_ivp,%r10 # restore ivp
mov $s0,0(%r10) # save ivec
mov $s1,4(%r10)
mov $s2,8(%r10)
mov $s3,12(%r10)
mov $_ivp,%rbp # restore ivp
mov $s0,0(%rbp) # save ivec
mov $s1,4(%rbp)
mov $s2,8(%rbp)
mov $s3,12(%rbp)
.align 4
.Lcbc_cleanup:
@ -858,36 +1200,34 @@ AES_cbc_encrypt:
.align 4
.Lcbc_enc_tail:
cmp $inp,$out
mov $inp,%r11
mov $out,%r12
je .Lcbc_enc_in_place
mov %r10,%rcx
xchg %rsi,%rdi
mov $inp,%rsi
mov $out,%rdi
.long 0xF689A4F3 # rep movsb
.Lcbc_enc_in_place:
mov \$16,%rcx # zero tail
sub %r10,%rcx
xor %rax,%rax
.long 0xF689AAF3 # rep stosb
mov %r12,$inp # this is not a mistake!
mov %r12,$out
mov $out,$inp # this is not a mistake!
movq \$16,$_len # len=16
jmp .Lcbc_enc_loop # one more spin...
#----------------------------- DECRYPT -----------------------------#
.align 16
.LDECRYPT:
lea AES_Td-.Lcbc_pic_point($tbl),$tbl
lea AES_Td-.Lcbc_pic_point($sbox),$sbox
# allocate aligned stack frame...
lea -64-248(%rsp),$key
and \$-64,$key
# ... and make sure it doesn't alias with AES_Td modulo 4096
mov $tbl,%r10
lea 3072($tbl),%r11
mov $sbox,%r10
lea 3072($sbox),%r11
mov $key,%r12
and \$0xFFF,%r10 # s = $tbl&0xfff
and \$0xFFF,%r11 # e = ($tbl+2048)&0xfff
and \$0xFFF,%r10 # s = $sbox&0xfff
and \$0xFFF,%r11 # e = ($sbox+3072)&0xfff
and \$0xFFF,%r12 # p = %rsp&0xfff
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
@ -909,13 +1249,15 @@ AES_cbc_encrypt:
mov %rdx,$_len # save copy of len
mov %rcx,$_key # save copy of key
mov %r8,$_ivp # save copy of ivp
movl \$0,$mark # copy of aes_key->rounds = 0;
mov %r8,%rbp # rearrange input arguments
mov %rsi,$out
mov %rdi,$inp
mov %rcx,$key
# do we copy key schedule to stack?
mov $key,%r10
sub $tbl,%r10
sub $sbox,%r10
and \$0xfff,%r10
cmp \$3072,%r10
jb .Lcbc_do_dcopy
@ -923,8 +1265,6 @@ AES_cbc_encrypt:
jb .Lcbc_skip_dcopy
.align 4
.Lcbc_do_dcopy:
mov %rsi,%r10 # backup $inp,$out
mov %rdi,%r11
mov $key,%rsi
lea $aes_key,%rdi
lea $aes_key,$key
@ -932,51 +1272,51 @@ AES_cbc_encrypt:
.long 0x90A548F3 # rep movsq
mov (%rsi),%eax # copy aes_key->rounds
mov %eax,(%rdi)
mov %r10,%rsi # restore $inp,$out
mov %r11,%rdi
.Lcbc_skip_dcopy:
mov $key,$keyp # save key pointer
mov \$24,%ecx
.align 4
.Lcbc_prefetch_td:
mov 0($tbl),%r10
mov 32($tbl),%r11
mov 64($tbl),%r12
mov 96($tbl),%r13
lea 128($tbl),$tbl
mov 0($sbox),%r10
mov 32($sbox),%r11
mov 64($sbox),%r12
mov 96($sbox),%r13
lea 128($sbox),$sbox
sub \$1,%ecx
jnz .Lcbc_prefetch_td
sub \$3072,$tbl
sub \$3072,$sbox
cmp $inp,$out
je .Lcbc_dec_in_place
mov %r8,$ivec
mov %rbp,$ivec
.align 4
.Lcbc_dec_loop:
mov 0($inp),$s0 # read input
mov 4($inp),$s1
mov 8($inp),$s2
mov 12($inp),$s3
mov $inp,8+$ivec # if ($verticalspin) save inp
mov $keyp,$key # load key
mov $keyp,$key # restore key
call _x86_64_AES_decrypt
mov $ivec,%r8 # load ivp
xor 0(%r8),$s0 # xor iv
xor 4(%r8),$s1
xor 8(%r8),$s2
xor 12(%r8),$s3
mov $inp,%r8 # current input, next iv
mov $ivec,%rbp # load ivp
mov 8+$ivec,$inp # if ($verticalspin) restore inp
xor 0(%rbp),$s0 # xor iv
xor 4(%rbp),$s1
xor 8(%rbp),$s2
xor 12(%rbp),$s3
mov $inp,%rbp # current input, next iv
mov $_len,%r10 # load len
mov $_len,%r10 # load len
sub \$16,%r10
jc .Lcbc_dec_partial
mov %r10,$_len # update len
mov %r8,$ivec # update ivp
mov %r10,$_len # update len
mov %rbp,$ivec # update ivp
mov $s0,0($out) # write output
mov $s0,0($out) # write output
mov $s1,4($out)
mov $s2,8($out)
mov $s3,12($out)
@ -985,11 +1325,11 @@ AES_cbc_encrypt:
lea 16($out),$out
jnz .Lcbc_dec_loop
.Lcbc_dec_end:
mov $_ivp,%r9 # load user ivp
mov 0(%r8),%r10 # load iv
mov 8(%r8),%r11
mov %r10,0(%r9) # copy back to user
mov %r11,8(%r9)
mov $_ivp,%r12 # load user ivp
mov 0(%rbp),%r10 # load iv
mov 8(%rbp),%r11
mov %r10,0(%r12) # copy back to user
mov %r11,8(%r12)
jmp .Lcbc_cleanup
.align 4
@ -1007,26 +1347,28 @@ AES_cbc_encrypt:
.align 16
.Lcbc_dec_in_place:
mov 0($inp),$s0 # load input
mov 0($inp),$s0 # load input
mov 4($inp),$s1
mov 8($inp),$s2
mov 12($inp),$s3
mov $inp,$ivec # if ($verticalspin) save inp
mov $keyp,$key
call _x86_64_AES_decrypt
mov $_ivp,%r8
xor 0(%r8),$s0
xor 4(%r8),$s1
xor 8(%r8),$s2
xor 12(%r8),$s3
mov $ivec,$inp # if ($verticalspin) restore inp
mov $_ivp,%rbp
xor 0(%rbp),$s0
xor 4(%rbp),$s1
xor 8(%rbp),$s2
xor 12(%rbp),$s3
mov 0($inp),%r10 # copy input to iv
mov 0($inp),%r10 # copy input to iv
mov 8($inp),%r11
mov %r10,0(%r8)
mov %r11,8(%r8)
mov %r10,0(%rbp)
mov %r11,8(%rbp)
mov $s0,0($out) # save output [zaps input]
mov $s0,0($out) # save output [zaps input]
mov $s1,4($out)
mov $s2,8($out)
mov $s3,12($out)
@ -1044,7 +1386,7 @@ AES_cbc_encrypt:
.Lcbc_dec_in_place_partial:
# one can argue if this is actually required
lea ($out,%rcx),%rdi
lea (%r8,%rcx),%rsi
lea (%rbp,%rcx),%rsi
neg %rcx
.long 0xF689A4F3 # rep movsb # restore tail
jmp .Lcbc_cleanup
@ -1262,6 +1604,8 @@ ___
&data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
&data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

crypto/perlasm/x86_64-xlate.pl

@ -167,10 +167,12 @@ my $current_function;
my $self = shift;
my $sz = shift;
# silently convert all EAs to 64-bit, required for elder GNU
# assembler and results in more compact code
$self->{index} =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
# Silently convert all EAs to 64-bit. This is required for
# elder GNU assembler and results in more compact code,
# *but* most importantly AES module depends on this feature!
$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
if (!$masm) {
# Solaris /usr/ccs/bin/as can't handle multiplications
# in $self->{label}
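The one-character change in the character class ([0-9xp] -> [0-9xpi]) is what the "AES module depends on this feature" remark is about: the rewritten AES code keeps its accumulators in %esi/%edi/%ebp and uses them as index registers (e.g. "mov 0($sbox,$acc0,8),$t0"), so the translator must now promote %esi/%edi inside effective addresses just as it already promoted %ebp and the numbered registers. A stand-alone run of the two patterns:

    #!/usr/bin/env perl
    # The perlasm EA substitution, before and after this commit, applied to a
    # few register names (the anchored pattern implies the leading % is already
    # stripped at this point).
    for my $reg (qw(eax r9d ebp esi edi)) {
        my ($old, $new) = ($reg, $reg);
        $old =~ s/^[er](.?[0-9xp])[d]?$/r$1/;     # previous pattern
        $new =~ s/^[er](.?[0-9xpi])[d]?$/r$1/;    # pattern after this commit
        printf "%-4s  old: %-4s  new: %s\n", $reg, $old, $new;
    }
    # eax, r9d and ebp were already promoted (rax, r9, rbp); esi and edi are
    # only promoted to rsi/rdi by the new pattern.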