Add support for Intel SHA extension.

This commit is contained in:
Andy Polyakov 2014-06-11 10:27:45 +02:00
parent fd2309aa29
commit 619b94667c
8 changed files with 2387 additions and 52 deletions

View File

@ -118,7 +118,9 @@ $code.=<<___;
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11d
mov OPENSSL_ia32cap_P+4(%rip),%r11
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
and \$`1<<28`,%r11d # mask AVX bit
@ -200,7 +202,7 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
mov $key,%r15
lea 112($key),%r15 # size optimization
movdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
@ -209,7 +211,7 @@ my $rounds="${ivp}d";
$code.=<<___;
shl \$6,$len
sub $in0,$out
mov 240($key),$rounds
mov 240-112($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
@ -243,8 +245,8 @@ $code.=<<___;
psubd @Tx[1],@X[-3&7]
movdqa @X[-2&7],32(%rsp)
psubd @Tx[1],@X[-2&7]
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
movups -112($key),$rndkey0 # $key[0]
movups 16-112($key),$rndkey[0] # forward reference
jmp .Loop_ssse3
___
@ -261,31 +263,31 @@ ___
___
$code.=<<___;
xorps $in,$iv
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*$k`($key),$rndkey[1]
___
} elsif ($k==9) {
$sn++;
$code.=<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($k+0)`($key),$rndkey[1]
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)`($key),$rndkey[0]
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($k+2)`($key),$rndkey[1]
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)`($key),$rndkey[0]
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16($key),$rndkey[1] # forward reference
movups 16-112($key),$rndkey[1] # forward reference
___
} else {
$code.=<<___;
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*$k`($key),$rndkey[1]
___
}
$r++; unshift(@rndkey,pop(@rndkey));
@ -1041,7 +1043,7 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
mov $key,%r15
lea 112($key),%r15 # size optimization
vmovdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
@ -1050,8 +1052,7 @@ my $rounds="${ivp}d";
$code.=<<___;
shl \$6,$len
sub $in0,$out
mov 240($key),$rounds
add \$112,$key # size optimization
mov 240-112($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
@ -1651,11 +1652,180 @@ K_XX_XX:
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
{{{
# Register allocation for the SHA-extension flavour of the stitched
# AES-CBC+SHA1 routine.  The first five assignments carry no `my`: they
# re-assign variables that are presumably declared earlier in the file and
# shared with the &$aesenc emitter -- TODO confirm against the full file.
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$rounds="%r11d";
($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
# restart the step counter that the code below compares against 40
$r=0;
# SHA1 state, byte-swap mask and per-block save slots live in %xmm7-%xmm12,
# the four message-schedule registers in %xmm3-%xmm6
my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
# Entry point: the 7th argument is fetched from the stack, at 8(%rsp) or,
# when $win64 is set, at 56(%rsp).
$code.=<<___;
.type aesni_cbc_sha1_enc_shaext,\@function,6
.align 32
aesni_cbc_sha1_enc_shaext:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
# Win64 only: spill the callee-saved %xmm6-%xmm15 before they are clobbered.
# The ten 16-byte slots are addressed relative to %rax and reach down to
# -8-10*16(%rax), so %rsp has to be lowered by that same amount.  Reserving
# only 8+4*16 bytes would leave six of the slots below %rsp, and Win64 has no
# red zone, so nothing guarantees that data below %rsp survives.
# NOTE(review): %rax is not set up in the visible prologue -- this assumes it
# already holds the entry value of %rsp (the epilogue's "mov %rax,%rsp"
# relies on the same thing); confirm against the Win64 function prologue.
$code.=<<___ if ($win64);
lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm9,-8-7*16(%rax)
movaps %xmm10,-8-6*16(%rax)
movaps %xmm11,-8-5*16(%rax)
movaps %xmm12,-8-4*16(%rax)
movaps %xmm13,-8-3*16(%rax)
movaps %xmm14,-8-2*16(%rax)
movaps %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
movdqu ($ctx),$ABCD
movd 16($ctx),$E
movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0b00011011,$ABCD,$ABCD # flip word order
pshufd \$0b00011011,$E,$E # flip word order
jmp .Loop_shaext
.align 16
.Loop_shaext:
___
&$aesenc();
$code.=<<___;
movdqu ($inp),@MSG[0]
movdqa $E,$E_SAVE # offload $E
pshufb $BSWAP,@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqa $ABCD,$ABCD_SAVE # offload $ABCD
___
&$aesenc();
$code.=<<___;
pshufb $BSWAP,@MSG[1]
paddd @MSG[0],$E
movdqu 0x20($inp),@MSG[2]
lea 0x40($inp),$inp
pxor $E_SAVE,@MSG[0] # black magic
___
&$aesenc();
$code.=<<___;
pxor $E_SAVE,@MSG[0] # black magic
movdqa $ABCD,$E_
pshufb $BSWAP,@MSG[2]
sha1rnds4 \$0,$E,$ABCD # 0-3
sha1nexte @MSG[1],$E_
___
&$aesenc();
$code.=<<___;
sha1msg1 @MSG[1],@MSG[0]
movdqu -0x10($inp),@MSG[3]
movdqa $ABCD,$E
pshufb $BSWAP,@MSG[3]
___
&$aesenc();
$code.=<<___;
sha1rnds4 \$0,$E_,$ABCD # 4-7
sha1nexte @MSG[2],$E
pxor @MSG[2],@MSG[0]
sha1msg1 @MSG[2],@MSG[1]
___
&$aesenc();
for($i=2;$i<20-4;$i++) {
$code.=<<___;
movdqa $ABCD,$E_
sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
sha1nexte @MSG[3],$E_
___
&$aesenc();
$code.=<<___;
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
sha1msg1 @MSG[3],@MSG[2]
___
($E,$E_)=($E_,$E);
push(@MSG,shift(@MSG));
&$aesenc();
}
$code.=<<___;
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 64-67
sha1nexte @MSG[3],$E_
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
___
&$aesenc();
$code.=<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 68-71
sha1nexte @MSG[0],$E
sha1msg2 @MSG[0],@MSG[1]
___
&$aesenc();
$code.=<<___;
movdqa $E_SAVE,@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 72-75
sha1nexte @MSG[1],$E_
___
&$aesenc();
$code.=<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 76-79
sha1nexte $MSG[0],$E
___
while($r<40) { &$aesenc(); } # remaining aesenc's
$code.=<<___;
dec $len
paddd $ABCD_SAVE,$ABCD
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0
jnz .Loop_shaext
pshufd \$0b00011011,$ABCD,$ABCD
pshufd \$0b00011011,$E,$E
movups $iv,($ivp) # write IV
movdqu $ABCD,($ctx)
movd $E,16($ctx)
___
$code.=<<___ if ($win64);
movaps -8-10*16(%rax),%xmm6
movaps -8-9*16(%rax),%xmm7
movaps -8-8*16(%rax),%xmm8
movaps -8-7*16(%rax),%xmm9
movaps -8-6*16(%rax),%xmm10
movaps -8-5*16(%rax),%xmm11
movaps -8-4*16(%rax),%xmm12
movaps -8-3*16(%rax),%xmm13
movaps -8-2*16(%rax),%xmm14
movaps -8-1*16(%rax),%xmm15
mov %rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
ret
.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@ -1793,12 +1963,43 @@ sub rex {
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
push @opcode,$rex|0x40 if($rex);
unshift @opcode,$rex|0x40 if($rex);
}
# Hand-assemble "sha1rnds4 $imm,%xmmS,%xmmD" into a ".byte" directive so the
# output does not depend on the assembler knowing the SHA extensions.
#   $_[0]   operand text following the mnemonic
# Returns the ".byte" line (optional REX prefix, 0F 3A CC, ModR/M, imm8), or
# the instruction text unchanged when the operands are not of the
# immediate,register,register form.
sub sha1rnds4 {
    my $operands = $_[0];

    my ($imm, $src, $dst) =
        $operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/
        or return "sha1rnds4\t" . $operands;

    my @bytes = (0x0f, 0x3a, 0xcc);
    rex(\@bytes, $dst, $src);                            # prefix for %xmm8 and up
    push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);  # ModR/M
    # immediates with a leading 0 ("0", "0x..") go through oct()
    push @bytes, ($imm =~ /^0/ ? oct($imm) : $imm);
    return ".byte\t" . join(',', @bytes);
}
# Hand-assemble the two-operand SHA1 helpers that share the 0F 38 opcode map
# (sha1nexte, sha1msg1, sha1msg2) into a ".byte" directive.
#   $mnemonic   instruction name
#   $operands   operand text, expected as "%xmmS,%xmmD"
# Returns the ".byte" line, or "mnemonic<TAB>operands" unchanged when the
# mnemonic is unknown or the operands are not two XMM registers.
sub sha1op38 {
    my ($mnemonic, $operands) = @_;

    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );
    my $verbatim = $mnemonic . "\t" . $operands;

    return $verbatim unless defined($opcodelet{$mnemonic});
    my ($src, $dst) = $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
        or return $verbatim;

    my @bytes = (0x0f, 0x38);
    rex(\@bytes, $dst, $src);                            # prefix for %xmm8 and up
    push @bytes, $opcodelet{$mnemonic};
    push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);  # ModR/M
    return ".byte\t" . join(',', @bytes);
}
sub aesni {
my $line=shift;
my @opcode=(0x66);
my @opcode=(0x0f,0x38);
if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
my %opcodelet = (
@ -1807,15 +2008,20 @@ sub aesni {
);
return undef if (!defined($opcodelet{$1}));
rex(\@opcode,$3,$2);
push @opcode,0x0f,0x38,$opcodelet{$1};
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M
unshift @opcode,0x66;
return ".byte\t".join(',',@opcode);
}
return $line;
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
print $code;
s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
print $_,"\n";
}
close STDOUT;

View File

@ -112,8 +112,13 @@ $code.=<<___ if ($avx);
cmp \$0,`$win64?"%rcx":"%rdi"`
je .Lprobe
mov 0(%r11),%eax
mov 4(%r11),%r10d
mov 8(%r11),%r11d
mov 4(%r11),%r10
bt \$61,%r10 # check for SHA
jc ${func}_shaext
mov %r10,%r11
shr \$32,%r11
test \$`1<<11`,%r10d # check for XOP
jnz ${func}_xop
@ -1196,6 +1201,288 @@ $code.=<<___;
.size ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my ($rounds,$Tbl)=("%r11d","%rbx");
my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
# Emitter for one step of the interleaved AES-CBC encryption.  Every call
# appends a few instructions to $code and advances the step counter $r.
# With integer arithmetic, $n=$r/10 selects the 16-byte block being
# encrypted and $k=$r%10 the step within that block.
# All round-key loads use offsets biased by -112, matching the
# "lea 112($key),$key" performed by the code that sets up $key.
my $aesenc=sub {
use integer;
my ($n,$k)=($r/10,$r%10);
# step 0: fetch the next input block, pre-xor it with round key 0, store the
# previous block's ciphertext (skipped for the very first block), chain the
# input into $iv and start the first round
if ($k==0) {
$code.=<<___;
movups `16*$n`($in0),$in # load input
xorps $rndkey0,$in
___
$code.=<<___ if ($n);
movups $iv,`16*($n-1)`($out,$in0) # write output
___
$code.=<<___;
xorps $in,$iv
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
# step 9: finish the block.  $rounds is compared with 11 once; "below" goes
# straight to the last round, "equal" runs two extra rounds first, anything
# larger runs four.  $sn makes the local label unique per block.
# NOTE(review): presumably these are the three supported key sizes -- confirm.
} elsif ($k==9) {
$sn++;
$code.=<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
nop
___
# steps 1..8: load the following round key, apply the current one
} else {
$code.=<<___;
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
}
# swap the two round-key registers: the key just loaded into $rndkey[1]
# becomes $rndkey[0] for the next step
$r++; unshift(@rndkey,pop(@rndkey));
};
$code.=<<___;
.type ${func}_shaext,\@function,6
.align 32
${func}_shaext:
mov %rsp,%rax
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
push %rbx
___
# Win64 only: spill the callee-saved %xmm6-%xmm15.  %rax holds the entry
# value of %rsp and "push %rbx" has already lowered %rsp by 8, so the ten
# 16-byte slots at -8-10*16(%rax) .. -8-1*16(%rax) need another 10*16 bytes.
# Reserving only 4*16 would leave six of the slots below %rsp, and Win64 has
# no red zone, so nothing guarantees that data below %rsp survives.
$code.=<<___ if ($win64);
lea `-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm9,-8-7*16(%rax)
movaps %xmm10,-8-6*16(%rax)
movaps %xmm11,-8-5*16(%rax)
movaps %xmm12,-8-4*16(%rax)
movaps %xmm13,-8-3*16(%rax)
movaps %xmm14,-8-2*16(%rax)
movaps %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
movdqa $ABEF,$ABEF_SAVE # offload
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
lea 0x40($inp),$inp
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256msg1 @MSG[1],@MSG[0]
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
paddd $TMP,@MSG[0]
___
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc();
$code.=<<___;
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
paddd $TMP,@MSG[1]
sha256rnds2 $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
&$aesenc() if (($r%10)==0);
$code.=<<___;
movdqa $i*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
paddd $TMP,@MSG[2]
___
&$aesenc();
&$aesenc() if ($r==19);
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
___
push(@MSG,shift(@MSG));
}
$code.=<<___;
movdqa 13*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
paddd $TMP,@MSG[2]
___
&$aesenc();
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 14*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
___
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
___
&$aesenc();
&$aesenc();
$code.=<<___;
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc();
$code.=<<___;
sha256rnds2 $CDGH,$ABEF
#pxor $CDGH,$rndkey0 # black magic
___
while ($r<40) { &$aesenc(); } # remaining aesenc's
$code.=<<___;
#xorps $CDGH,$rndkey0 # black magic
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
dec $len
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movups $iv,($ivp) # write IV
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
movaps -8-10*16(%rax),%xmm6
movaps -8-9*16(%rax),%xmm7
movaps -8-8*16(%rax),%xmm8
movaps -8-7*16(%rax),%xmm9
movaps -8-6*16(%rax),%xmm10
movaps -8-5*16(%rax),%xmm11
movaps -8-4*16(%rax),%xmm12
movaps -8-3*16(%rax),%xmm13
movaps -8-2*16(%rax),%xmm14
movaps -8-1*16(%rax),%xmm15
.Lepilogue_shaext:
___
$code.=<<___;
mov -8(%rax),%rbx
mov %rax,%rsp
ret
.size ${func}_shaext,.-${func}_shaext
___
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@ -1347,6 +1634,39 @@ $code.=<<___ if ($avx>1);
___
}
####################################################################
# Prepend a REX prefix to an opcode byte list when it is needed.
#   $opcode   reference to the array of opcode bytes (modified in place)
#   $dst      number of the register encoded in ModR/M.reg
#   $src      number of the register encoded in ModR/M.rm
# Registers 8-15 need one extra bit each: REX.R (0x04) extends ModR/M.reg,
# REX.B (0x01) extends ModR/M.rm.  Nothing is added when both are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    unshift @$opcode,$rex|0x40 if ($rex);
}

{
    # third opcode byte of each instruction in the 0F 38 map
    my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd );

    # Hand-assemble a register-to-register SHA256 instruction into a ".byte"
    # directive so the output does not depend on assembler support.
    #   $instr   mnemonic
    #   $_[0]    operand text, expected as "%xmmS,%xmmD"
    # Returns the ".byte" line, or "mnemonic<TAB>operands" unchanged when the
    # mnemonic is unknown or the operands are not two XMM registers.
    #
    # The register numbers are matched as whole numbers.  Matching a single
    # digit 0-7 would skip the %xmm8-%xmm15 operands this file uses, and
    # could mis-read "%xmm1,%xmm10" as "%xmm1,%xmm1".
    sub sha256op38 {
        my $instr = shift;

        if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
            my ($src,$dst)=($1,$2);
            my @opcode=(0x0f,0x38);
            rex(\@opcode,$dst,$src);
            push @opcode,$opcodelet{$instr};
            push @opcode,0xc0|($src&7)|(($dst&7)<<3); # ModR/M
            return ".byte\t".join(',',@opcode);
        } else {
            return $instr."\t".$_[0];
        }
    }
}
# Final pass over the generated text:
# 1. replace every `...` span with the result of evaluating it as Perl,
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. hand every sha256* instruction (mnemonic, rest of line) to sha256op38()
#    and substitute whatever it returns,
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
# 3. write the result to STDOUT.
print $code;
close STDOUT;

View File

@ -79,6 +79,10 @@
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
@ -303,6 +307,7 @@ if ($alt) {
&function_begin("sha1_block_data_order");
if ($xmm) {
&static_label("shaext_shortcut");
&static_label("ssse3_shortcut");
&static_label("avx_shortcut") if ($ymm);
&static_label("K_XX_XX");
@ -317,8 +322,11 @@ if ($xmm) {
&mov ($D,&DWP(4,$T));
&test ($D,1<<9); # check SSSE3 bit
&jz (&label("x86"));
&mov ($C,&DWP(8,$T));
&test ($A,1<<24); # check FXSR bit
&jz (&label("x86"));
&test ($C,1<<29); # check SHA bit
&jnz (&label("shaext_shortcut"));
if ($ymm) {
&and ($D,1<<28); # mask AVX bit
&and ($A,1<<30); # mask "Intel CPU" bit
@ -397,6 +405,117 @@ if ($xmm) {
&function_end("sha1_block_data_order");
if ($xmm) {
{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("edi","esi","ecx");
my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
my @MSG=map("xmm$_",(4..7));
# Emit "sha1rnds4 dst,src,imm" as raw bytes (0F 3A CC, ModR/M, imm8) through
# &data_byte.  Only xmm0-xmm7 operands are recognised; for anything else
# nothing is emitted.
sub sha1rnds4 {
    my ($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
        my ($reg,$rm)=($1,$2);
        my $modrm=0xc0|($reg<<3)|$rm;    # register-direct, dst in the reg field
        &data_byte(0x0f,0x3a,0xcc,$modrm,$imm);
    }
}
# Emit a two-operand instruction from the 0F 38 opcode map as raw bytes
# through &data_byte.  $opcodelet is the third opcode byte.  Only xmm0-xmm7
# operands are recognised; for anything else nothing is emitted.
sub sha1op38 {
    my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
        my ($reg,$rm)=($1,$2);
        my @insn=(0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm);
        &data_byte(@insn);
    }
}
# Mnemonic wrappers: each forwards its (dst,src) operands to sha1op38 together
# with the third opcode byte of the instruction.
sub sha1nexte { my @operands=@_; sha1op38(0xc8,@operands); }
sub sha1msg1 { my @operands=@_; sha1op38(0xc9,@operands); }
sub sha1msg2 { my @operands=@_; sha1op38(0xca,@operands); }
&function_begin("_sha1_block_data_order_shaext");
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tmp1);
&lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("shaext_shortcut");
&mov ($ctx,&wparam(0));
&mov ("ebx","esp");
&mov ($inp,&wparam(1));
&mov ($num,&wparam(2));
# Reserve scratch space (aligned to 32 bytes below) and load the SHA1 state:
# four words into $ABCD, the fifth word into $E.
&sub ("esp",32);
&movdqu ($ABCD,&QWP(0,$ctx));
# movd transfers a single 32-bit word, so the operand is a DWORD reference,
# the same form the matching store of $E uses at the end of the function
&movd ($E,&DWP(16,$ctx));
&and ("esp",-32);
&movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap
&movdqu (@MSG[0],&QWP(0,$inp));
&pshufd ($ABCD,$ABCD,0b00011011); # flip word order
&movdqu (@MSG[1],&QWP(0x10,$inp));
&pshufd ($E,$E,0b00011011); # flip word order
&movdqu (@MSG[2],&QWP(0x20,$inp));
&pshufb (@MSG[0],$BSWAP);
&movdqu (@MSG[3],&QWP(0x30,$inp));
&pshufb (@MSG[1],$BSWAP);
&pshufb (@MSG[2],$BSWAP);
&pshufb (@MSG[3],$BSWAP);
&jmp (&label("loop_shaext"));
&set_label("loop_shaext",16);
&dec ($num);
&lea ("eax",&DWP(0x40,$inp));
&movdqa (&QWP(0,"esp"),$E); # offload $E
&paddd ($E,@MSG[0]);
&cmovne ($inp,"eax");
&movdqa (&QWP(16,"esp"),$ABCD); # offload $ABCD
for($i=0;$i<20-4;$i+=2) {
&sha1msg1 (@MSG[0],@MSG[1]);
&movdqa ($E_,$ABCD);
&sha1rnds4 ($ABCD,$E,int($i/5)); # 0-3...
&sha1nexte ($E_,@MSG[1]);
&pxor (@MSG[0],@MSG[2]);
&sha1msg1 (@MSG[1],@MSG[2]);
&sha1msg2 (@MSG[0],@MSG[3]);
&movdqa ($E,$ABCD);
&sha1rnds4 ($ABCD,$E_,int(($i+1)/5));
&sha1nexte ($E,@MSG[2]);
&pxor (@MSG[1],@MSG[3]);
&sha1msg2 (@MSG[1],@MSG[0]);
push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
}
&movdqu (@MSG[0],&QWP(0,$inp));
&movdqa ($E_,$ABCD);
&sha1rnds4 ($ABCD,$E,3); # 64-67
&sha1nexte ($E_,@MSG[1]);
&movdqu (@MSG[1],&QWP(0x10,$inp));
&pshufb (@MSG[0],$BSWAP);
&movdqa ($E,$ABCD);
&sha1rnds4 ($ABCD,$E_,3); # 68-71
&sha1nexte ($E,@MSG[2]);
&movdqu (@MSG[2],&QWP(0x20,$inp));
&pshufb (@MSG[1],$BSWAP);
&movdqa ($E_,$ABCD);
&sha1rnds4 ($ABCD,$E,3); # 72-75
&sha1nexte ($E_,@MSG[3]);
&movdqu (@MSG[3],&QWP(0x30,$inp));
&pshufb (@MSG[2],$BSWAP);
&movdqa ($E,$ABCD);
&sha1rnds4 ($ABCD,$E_,3); # 76-79
&movdqa ($E_,&QWP(0,"esp"));
&pshufb (@MSG[3],$BSWAP);
&sha1nexte ($E,$E_);
&paddd ($ABCD,&QWP(16,"esp"));
&jnz (&label("loop_shaext"));
# Flip the word order back, store the updated state and restore the stack
# pointer saved in ebx.
&pshufd ($ABCD,$ABCD,0b00011011);
&pshufd ($E,$E,0b00011011);
# each emitter call is its own statement; without the terminator the two
# calls below would be parsed as one bitwise-AND expression
&movdqu (&QWP(0,$ctx),$ABCD);
&movd (&DWP(16,$ctx),$E);
&mov ("esp","ebx");
&function_end("_sha1_block_data_order_shaext");
}
######################################################################
# The SSSE3 implementation.
#
@ -1340,6 +1459,7 @@ sub Xtail_avx()
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

View File

@ -15,7 +15,7 @@
# this +aesni(i) sha1 aesni-sha1 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
# Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62%
# Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51%
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
@ -338,9 +338,11 @@ $code.=<<___;
.type sha1_multi_block,\@function,3
.align 32
sha1_multi_block:
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
___
$code.=<<___ if ($avx);
mov OPENSSL_ia32cap_P+4(%rip),%rcx
test \$`1<<28`,%ecx
jnz _avx_shortcut
___
@ -366,6 +368,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`,%rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@ -476,9 +479,265 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue:
ret
.size sha1_multi_block,.-sha1_multi_block
___
{{{
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));
$code.=<<___;
.type sha1_multi_block_shaext,\@function,3
.align 32
sha1_multi_block_shaext:
_shaext_shortcut:
mov %rsp,%rax
push %rbx
push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,-0x78(%rax)
movaps %xmm11,-0x68(%rax)
movaps %xmm12,-0x58(%rax)
movaps %xmm13,-0x48(%rax)
movaps %xmm14,-0x38(%rax)
movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`,%rsp
shl \$1,$num # we process pair at a time
and \$-256,%rsp
lea 0x40($ctx),$ctx # size optimization
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_shaext:
lea `$REG_SZ*16`(%rsp),%rbx
movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
.Loop_grande_shaext:
mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
xor $num,$num
___
for($i=0;$i<2;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
___
}
$code.=<<___;
test $num,$num
jz .Ldone_shaext
movq 0x00-0x40($ctx),$ABCD0 # a1.a0
movq 0x20-0x40($ctx),@MSG0[0]# b1.b0
movq 0x40-0x40($ctx),@MSG0[1]# c1.c0
movq 0x60-0x40($ctx),@MSG0[2]# d1.d0
movq 0x80-0x40($ctx),@MSG0[3]# e1.e0
punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0
punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0
movdqa $ABCD0,$ABCD1
punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0
punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1
pshufd \$0b00111111,@MSG0[3],$E0
pshufd \$0b01111111,@MSG0[3],$E1
pshufd \$0b00011011,$ABCD0,$ABCD0
pshufd \$0b00011011,$ABCD1,$ABCD1
jmp .Loop_shaext
.align 32
.Loop_shaext:
movdqu 0x00(@ptr[0]),@MSG0[0]
movdqu 0x00(@ptr[1]),@MSG1[0]
movdqu 0x10(@ptr[0]),@MSG0[1]
movdqu 0x10(@ptr[1]),@MSG1[1]
movdqu 0x20(@ptr[0]),@MSG0[2]
pshufb $BSWAP,@MSG0[0]
movdqu 0x20(@ptr[1]),@MSG1[2]
pshufb $BSWAP,@MSG1[0]
movdqu 0x30(@ptr[0]),@MSG0[3]
lea 0x40(@ptr[0]),@ptr[0]
pshufb $BSWAP,@MSG0[1]
movdqu 0x30(@ptr[1]),@MSG1[3]
lea 0x40(@ptr[1]),@ptr[1]
pshufb $BSWAP,@MSG1[1]
movdqa $E0,0x50(%rsp) # offload
paddd @MSG0[0],$E0
movdqa $E1,0x70(%rsp)
paddd @MSG1[0],$E1
movdqa $ABCD0,0x40(%rsp) # offload
movdqa $ABCD0,$E0_
movdqa $ABCD1,0x60(%rsp)
movdqa $ABCD1,$E1_
sha1rnds4 \$0,$E0,$ABCD0 # 0-3
sha1nexte @MSG0[1],$E0_
sha1rnds4 \$0,$E1,$ABCD1 # 0-3
sha1nexte @MSG1[1],$E1_
pshufb $BSWAP,@MSG0[2]
prefetcht0 127(@ptr[0])
sha1msg1 @MSG0[1],@MSG0[0]
pshufb $BSWAP,@MSG1[2]
prefetcht0 127(@ptr[1])
sha1msg1 @MSG1[1],@MSG1[0]
pshufb $BSWAP,@MSG0[3]
movdqa $ABCD0,$E0
pshufb $BSWAP,@MSG1[3]
movdqa $ABCD1,$E1
sha1rnds4 \$0,$E0_,$ABCD0 # 4-7
sha1nexte @MSG0[2],$E0
sha1rnds4 \$0,$E1_,$ABCD1 # 4-7
sha1nexte @MSG1[2],$E1
pxor @MSG0[2],@MSG0[0]
sha1msg1 @MSG0[2],@MSG0[1]
pxor @MSG1[2],@MSG1[0]
sha1msg1 @MSG1[2],@MSG1[1]
___
for($i=2;$i<20-4;$i++) {
$code.=<<___;
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11
sha1nexte @MSG0[3],$E0_
sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11
sha1nexte @MSG1[3],$E1_
sha1msg2 @MSG0[3],@MSG0[0]
sha1msg2 @MSG1[3],@MSG1[0]
pxor @MSG0[3],@MSG0[1]
sha1msg1 @MSG0[3],@MSG0[2]
pxor @MSG1[3],@MSG1[1]
sha1msg1 @MSG1[3],@MSG1[2]
___
($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1);
push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
}
$code.=<<___;
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$3,$E0,$ABCD0 # 64-67
sha1nexte @MSG0[3],$E0_
sha1rnds4 \$3,$E1,$ABCD1 # 64-67
sha1nexte @MSG1[3],$E1_
sha1msg2 @MSG0[3],@MSG0[0]
sha1msg2 @MSG1[3],@MSG1[0]
pxor @MSG0[3],@MSG0[1]
pxor @MSG1[3],@MSG1[1]
mov \$1,%ecx
pxor @MSG0[2],@MSG0[2] # zero
cmp 4*0(%rbx),%ecx # examine counters
cmovge %rsp,@ptr[0] # cancel input
movdqa $ABCD0,$E0
movdqa $ABCD1,$E1
sha1rnds4 \$3,$E0_,$ABCD0 # 68-71
sha1nexte @MSG0[0],$E0
sha1rnds4 \$3,$E1_,$ABCD1 # 68-71
sha1nexte @MSG1[0],$E1
sha1msg2 @MSG0[0],@MSG0[1]
sha1msg2 @MSG1[0],@MSG1[1]
cmp 4*1(%rbx),%ecx
cmovge %rsp,@ptr[1]
movq (%rbx),@MSG0[0] # pull counters
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$3,$E0,$ABCD0 # 72-75
sha1nexte @MSG0[1],$E0_
sha1rnds4 \$3,$E1,$ABCD1 # 72-75
sha1nexte @MSG1[1],$E1_
pshufd \$0x00,@MSG0[0],@MSG1[2]
pshufd \$0x55,@MSG0[0],@MSG1[3]
movdqa @MSG0[0],@MSG0[1]
pcmpgtd @MSG0[2],@MSG1[2]
pcmpgtd @MSG0[2],@MSG1[3]
movdqa $ABCD0,$E0
movdqa $ABCD1,$E1
sha1rnds4 \$3,$E0_,$ABCD0 # 76-79
sha1nexte $MSG0[2],$E0
sha1rnds4 \$3,$E1_,$ABCD1 # 76-79
sha1nexte $MSG0[2],$E1
pcmpgtd @MSG0[2],@MSG0[1] # counter mask
pand @MSG1[2],$ABCD0
pand @MSG1[2],$E0
pand @MSG1[3],$ABCD1
pand @MSG1[3],$E1
paddd @MSG0[1],@MSG0[0] # counters--
paddd 0x40(%rsp),$ABCD0
paddd 0x50(%rsp),$E0
paddd 0x60(%rsp),$ABCD1
paddd 0x70(%rsp),$E1
movq @MSG0[0],(%rbx) # save counters
dec $num
jnz .Loop_shaext
mov `$REG_SZ*17+8`(%rsp),$num
pshufd \$0b00011011,$ABCD0,$ABCD0
pshufd \$0b00011011,$ABCD1,$ABCD1
movdqa $ABCD0,@MSG0[0]
punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0
punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0
punpckhdq $E1,$E0 # e1.e0.xx.xx
movq $ABCD0,0x00-0x40($ctx) # a1.a0
psrldq \$8,$ABCD0
movq @MSG0[0],0x40-0x40($ctx)# c1.c0
psrldq \$8,@MSG0[0]
movq $ABCD0,0x20-0x40($ctx) # b1.b0
psrldq \$8,$E0
movq @MSG0[0],0x60-0x40($ctx)# d1.d0
movq $E0,0x80-0x40($ctx) # e1.e0
lea `$REG_SZ/2`($ctx),$ctx
lea `16*2`($inp),$inp
dec $num
jnz .Loop_grande_shaext
.Ldone_shaext:
#mov `$REG_SZ*17`(%rsp),%rax # original %rsp
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
movaps -0xa8(%rax),%xmm7
movaps -0x98(%rax),%xmm8
movaps -0x88(%rax),%xmm9
movaps -0x78(%rax),%xmm10
movaps -0x68(%rax),%xmm11
movaps -0x58(%rax),%xmm12
movaps -0x48(%rax),%xmm13
movaps -0x38(%rax),%xmm14
movaps -0x28(%rax),%xmm15
___
$code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_shaext:
ret
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
}}}
if ($avx) {{{
sub BODY_00_19_avx {
@ -752,6 +1011,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@ -858,6 +1118,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx:
ret
.size sha1_multi_block_avx,.-sha1_multi_block_avx
___
@ -904,6 +1165,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx2:
lea K_XX_XX(%rip),$Tbl
shr \$1,$num
@ -1015,6 +1277,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx2:
ret
.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
@ -1033,17 +1296,261 @@ K_XX_XX:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov `16*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
lea -24-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
# Win64 exception handler for the AVX2 code path (emitted only when $avx>1).
# When the faulting Rip lies between the body and epilogue labels taken from
# HandlerData[], it recovers the caller's %rsp saved in the frame, restores
# rbx, rbp, r12-r15 and the ten XMM registers into the CONTEXT record, then
# joins se_handler's common tail at .Lin_prologue.
# At the point where the saved stack pointer is pulled, %rax holds
# context->Rsp, so the slot is read through %rax -- just as se_handler does
# with its 16*17 offset -- and not through the CONTEXT pointer.
# NOTE(review): 32*17 assumes $REG_SZ is 32 on this path, matching the
# "mov %rax,`$REG_SZ*17`(%rsp)" in the function prologue -- confirm.
$code.=<<___ if ($avx>1);
.type avx2_handler,\@abi-omnipotent
.align 16
avx2_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<body label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
mov `32*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore cotnext->R12
mov %r13,224($context) # restore cotnext->R13
mov %r14,232($context) # restore cotnext->R14
mov %r15,240($context) # restore cotnext->R15
lea -56-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size avx2_handler,.-avx2_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha1_multi_block
.rva .LSEH_end_sha1_multi_block
.rva .LSEH_info_sha1_multi_block
.rva .LSEH_begin_sha1_multi_block_shaext
.rva .LSEH_end_sha1_multi_block_shaext
.rva .LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_sha1_multi_block_avx
.rva .LSEH_end_sha1_multi_block_avx
.rva .LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_sha1_multi_block_avx2
.rva .LSEH_end_sha1_multi_block_avx2
.rva .LSEH_info_sha1_multi_block_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
.byte 9,0,0,0
.rva avx2_handler
.rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
####################################################################
# Prepend a REX prefix to an opcode byte list when one is required.
#   first argument   reference to the array of opcode bytes (modified in place)
#   $dst, $src       register numbers; 8 or above sets bit 0x04 for $dst and
#                    bit 0x01 for $src
# The prefix byte (0x40 plus the collected bits) is only added when at least
# one bit is set.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    unshift @$bytes, $prefix | 0x40 if ($prefix);
}
# Translate "sha1rnds4 $imm,%xmmS,%xmmD" into an explicit ".byte" sequence:
# optional REX prefix, 0F 3A CC, ModR/M, immediate.
#   $_[0]   operand text following the mnemonic
# Operand text in any other form is returned as a plain "sha1rnds4" line.
sub sha1rnds4 {
    my $args = $_[0];

    unless ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        return "sha1rnds4\t" . $args;
    }
    my ($imm, $src, $dst) = ($1, $2, $3);

    my @encoding = (0x0f, 0x3a, 0xcc);
    rex(\@encoding, $dst, $src);
    my $modrm = 0xc0 | ($src & 7) | (($dst & 7) << 3);
    # an immediate written with a leading 0 is converted by oct()
    my $imm8 = $imm =~ /^0/ ? oct($imm) : $imm;
    return ".byte\t" . join(',', @encoding, $modrm, $imm8);
}
# Translate sha1nexte/sha1msg1/sha1msg2 with two XMM register operands into
# an explicit ".byte" sequence: optional REX prefix, 0F 38, opcode, ModR/M.
#   $insn   mnemonic
#   $args   operand text, expected as "%xmmS,%xmmD"
# Unknown mnemonics and other operand forms come back as "mnemonic<TAB>operands".
sub sha1op38 {
    my ($insn, $args) = @_;
    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );

    my $known = defined($opcodelet{$insn});
    if (!$known || $args !~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        return $insn . "\t" . $args;
    }
    my ($src, $dst) = ($1, $2);

    my @encoding = (0x0f, 0x38);
    rex(\@encoding, $dst, $src);
    push @encoding, $opcodelet{$insn}, 0xc0 | ($src & 7) | (($dst & 7) << 3);
    return ".byte\t" . join(',', @encoding);
}
# Post-process the generated code one line at a time and print it.
foreach (split("\n",$code)) {
# replace every `...` span with the result of evaluating it as Perl
s/\`([^\`]*)\`/eval($1)/ge;
# The substitutions below are chained with `or`, so at most one of them is
# applied to a line: the chain stops at the first one that matches.
# sha1rnds4 and the other sha1* instructions go through the encoder subs;
s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
# the remaining rules rewrite %ymmN operands of the listed instructions to
# the corresponding %xmmN (vinserti128 additionally gains a "$1," immediate)
s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
print $_,"\n";
}

View File

@ -57,6 +57,10 @@
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
@ -71,7 +75,7 @@
# Haswell 5.45 4.15/+31% 3.57/+53%
# Bulldozer 9.11 5.95/+53%
# VIA Nano 9.32 7.15/+30%
# Atom [10.5?] [9.23?]/+14%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
#
# (*) obviously suboptimal result, nothing was done about it,
@ -241,6 +245,9 @@ sha1_block_data_order:
mov OPENSSL_ia32cap_P+8(%rip),%r10d
test \$`1<<9`,%r8d # check SSSE3 bit
jz .Lialu
test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
@ -315,6 +322,120 @@ $code.=<<___;
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));
$code.=<<___;
.type sha1_block_data_order_shaext,\@function,3
.align 32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
lea `-8-4*16`(%rsp),%rsp
movaps %xmm6,-8-4*16(%rax)
movaps %xmm7,-8-3*16(%rax)
movaps %xmm8,-8-2*16(%rax)
movaps %xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the hash state and the first 64-byte block, byte-swap the message
# words and emit the head of the per-block loop.
#
# The Win64 prologue and epilogue of this function, and shaext_handler,
# address the XMM save area relative to %rax, and the epilogue ends with
# "mov %rax,%rsp".  %rax therefore has to survive the whole function, so
# the pointer to the next input block is staged in %r8, which this
# function does not otherwise use.
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext
.align	16
.Loop_shaext:
	dec	$num
	lea	0x40($inp),%r8			# next input block
	paddd	@MSG[0],$E
	cmovne	%r8,$inp
	movdqa	$ABCD,$ABCD_SAVE		# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
sha1msg1 @MSG[1],@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
sha1nexte @MSG[1],$E_
pxor @MSG[2],@MSG[0]
sha1msg1 @MSG[2],@MSG[1]
sha1msg2 @MSG[3],@MSG[0]
movdqa $ABCD,$E
sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
sha1nexte @MSG[2],$E
pxor @MSG[3],@MSG[1]
sha1msg2 @MSG[0],@MSG[1]
___
push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
}
$code.=<<___;
movdqu ($inp),@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 64-67
sha1nexte @MSG[1],$E_
movdqu 0x10($inp),@MSG[1]
pshufb $BSWAP,@MSG[0]
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 68-71
sha1nexte @MSG[2],$E
movdqu 0x20($inp),@MSG[2]
pshufb $BSWAP,@MSG[1]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 72-75
sha1nexte @MSG[3],$E_
movdqu 0x30($inp),@MSG[3]
pshufb $BSWAP,@MSG[2]
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 76-79
sha1nexte $E_SAVE,$E
pshufb $BSWAP,@MSG[3]
paddd $ABCD_SAVE,$ABCD
movdqa $E,$E_SAVE # offload $E
jnz .Loop_shaext
pshufd \$0b00011011,$ABCD,$ABCD
pshufd \$0b00011011,$E,$E
movdqu $ABCD,($ctx)
movd $E,16($ctx)
___
$code.=<<___ if ($win64);
movaps -8-4*16(%rax),%xmm6
movaps -8-3*16(%rax),%xmm7
movaps -8-2*16(%rax),%xmm8
movaps -8-1*16(%rax),%xmm9
mov %rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
ret
.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
@ -1646,6 +1767,7 @@ K_XX_XX:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
@ -1706,6 +1828,39 @@ se_handler:
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea -8-4*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_seh_tail
.size shaext_handler,.-shaext_handler
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
@ -1801,6 +1956,9 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
.rva .LSEH_begin_sha1_block_data_order_shaext
.rva .LSEH_end_sha1_block_data_order_shaext
.rva .LSEH_info_sha1_block_data_order_shaext
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
@ -1821,6 +1979,9 @@ $code.=<<___;
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
.LSEH_info_sha1_block_data_order_shaext:
.byte 9,0,0,0
.rva shaext_handler
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
@ -1842,6 +2003,41 @@ ___
####################################################################
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Translate the operand string of a "sha1rnds4 $imm,%xmmS,%xmmD" line into a
# raw ".byte" sequence (0F 3A CC /r ib) so that assemblers without SHA
# support can still build the module.
#
# Register numbers are parsed as whole decimal numbers, and a REX prefix is
# emitted for registers 8 and above.  Matching a single digit only would
# let "%xmm10" as the last operand be taken for "%xmm1" and encode the
# wrong register.
#
# Returns the ".byte" line, or "sha1rnds4<TAB>operands" when the operands do
# not have the immediate/register/register shape.
sub sha1rnds4 {
    my ($operands) = @_;
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm, $src, $dst) = ($1, $2, $3);
        my @opcode = (0x0f, 0x3a, 0xcc);
        my $rex = 0;
        $rex |= 0x04 if ($dst >= 8);    # REX.R extends ModR/M.reg
        $rex |= 0x01 if ($src >= 8);    # REX.B extends ModR/M.r/m
        unshift @opcode, $rex | 0x40 if ($rex);
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # Immediates with a leading 0 (octal/hex notation) go through oct().
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    return "sha1rnds4\t" . $operands;
}
# Encode a two-register SHA1 instruction from the 0F 38 opcode map
# (sha1nexte, sha1msg1, sha1msg2) as a ".byte" sequence so that assemblers
# without SHA support can still build the module.
#   first argument  - instruction mnemonic
#   second argument - operand string, expected as "%xmmS,%xmmD"
#
# Register numbers are parsed as whole decimal numbers, and a REX prefix is
# emitted for registers 8 and above.  This covers operands such as %xmm9
# and stops "%xmm12" as the last operand from being taken for "%xmm1".
#
# Unknown mnemonics and operand strings of any other shape are returned as
# "mnemonic<TAB>operands".
sub sha1op38 {
    my ($instr, $operands) = @_;
    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca );

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src, $dst) = ($1, $2);
        my @opcode = (0x0f, 0x38);
        my $rex = 0;
        $rex |= 0x04 if ($dst >= 8);    # REX.R extends ModR/M.reg
        $rex |= 0x01 if ($src >= 8);    # REX.B extends ModR/M.r/m
        unshift @opcode, $rex | 0x40 if ($rex);
        push @opcode, $opcodelet{$instr};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    return $instr . "\t" . $operands;
}
# Final pass: walk the generated assembly in $code one line at a time,
# rewrite the line in $_ and print it followed by a newline.
foreach (split("\n",$code)) {
# Evaluate every `...` span as a Perl expression and splice in its value.
s/\`([^\`]*)\`/eval $1/geo;
# SHA1 mnemonics are routed through sha1rnds4()/sha1op38(), which return
# either a ".byte" encoding or the mnemonic line.  The two rules are joined
# with "or", so the generic sha1* rule only runs when the sha1rnds4 rule
# did not match.
s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
print $_,"\n";
}
close STDOUT;

View File

@ -34,6 +34,10 @@
# (Biggest improvement coefficient is on upcoming Atom Silvermont,
# not shown.) Add AVX+BMI code path.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
@ -196,8 +200,13 @@ sub BODY_00_15() {
&mov ("ebx",&DWP(4,"edx"));
&test ("ecx",1<<20); # check for P4
&jnz (&label("loop"));
&mov ("edx",&DWP(8,"edx")) if ($xmm);
&test ("ecx",1<<24); # check for FXSR
&jz ($unroll_after?&label("no_xmm"):&label("loop"));
&and ("ecx",1<<30); # mask "Intel CPU" bit
&and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits
&test ("edx",1<<29) if ($xmm); # check for SHA
&jnz (&label("shaext")) if ($xmm);
&or ("ecx","ebx");
&and ("ecx",1<<28|1<<30);
&cmp ("ecx",1<<28|1<<30);
@ -209,6 +218,7 @@ sub BODY_00_15() {
&je (&label("loop_shrd"));
}
if ($unroll_after) {
&set_label("no_xmm");
&sub ("eax","edi");
&cmp ("eax",$unroll_after);
&jae (&label("unrolled"));
@ -495,6 +505,146 @@ my @AH=($A,$K256);
&function_end_A();
}
if (!$i386 && $xmm) {{{
{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$end)=("esi","edi","eax");
my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
my @MSG=map("xmm$_",(3..6));
# Emit the bytes 0F 38 <opcodelet> <ModR/M> through data_byte() for a
# register-to-register SHA256 instruction.
#   $opcodelet - third opcode byte
#   $dst       - destination register name, goes into ModR/M.reg
#   $src       - source register name, goes into ModR/M.r/m
# Nothing is emitted unless the pair is xmm0..xmm7 followed by xmm0..xmm7.
sub sha256op38 {
    my ($opcodelet, $dst, $src) = @_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
        my ($reg, $rm) = ($1, $2);
        &data_byte(0x0f, 0x38, $opcodelet, 0xc0 | ($reg << 3) | $rm);
    }
}
# Mnemonic wrappers: each forwards its (dst, src) operands to sha256op38()
# together with the third opcode byte of the instruction.
sub sha256rnds2 { my @operands = @_; sha256op38(0xcb, @operands); }
sub sha256msg1  { my @operands = @_; sha256op38(0xcc, @operands); }
sub sha256msg2  { my @operands = @_; sha256op38(0xcd, @operands); }
&set_label("shaext",32);
&sub ("esp",32);
&movdqu ($ABEF,&QWP(0,$ctx)); # DCBA
&lea ($K256,&DWP(0x80,$K256));
&movdqu ($CDGH,&QWP(16,$ctx)); # HGFE
&movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
&pshufd ($Wi,$ABEF,0x1b); # ABCD
&pshufd ($ABEF,$ABEF,0xb1); # CDAB
&pshufd ($CDGH,$CDGH,0x1b); # EFGH
&palignr ($ABEF,$CDGH,8); # ABEF
&punpcklqdq ($CDGH,$Wi); # CDGH
&jmp (&label("loop_shaext"));
&set_label("loop_shaext",16);
&movdqu (@MSG[0],&QWP(0,$inp));
&movdqu (@MSG[1],&QWP(0x10,$inp));
&movdqu (@MSG[2],&QWP(0x20,$inp));
&pshufb (@MSG[0],$TMP);
&movdqu (@MSG[3],&QWP(0x30,$inp));
&movdqa (&QWP(16,"esp"),$CDGH); # offload
&movdqa ($Wi,&QWP(0*16-0x80,$K256));
&paddd ($Wi,@MSG[0]);
&pshufb (@MSG[1],$TMP);
&sha256rnds2 ($CDGH,$ABEF); # 0-3
&pshufd ($Wi,$Wi,0x0e);
&nop ();
&movdqa (&QWP(0,"esp"),$ABEF); # offload
&sha256rnds2 ($ABEF,$CDGH);
&movdqa ($Wi,&QWP(1*16-0x80,$K256));
&paddd ($Wi,@MSG[1]);
&pshufb (@MSG[2],$TMP);
&sha256rnds2 ($CDGH,$ABEF); # 4-7
&pshufd ($Wi,$Wi,0x0e);
&lea ($inp,&DWP(0x40,$inp));
&sha256msg1 (@MSG[0],@MSG[1]);
&sha256rnds2 ($ABEF,$CDGH);
&movdqa ($Wi,&QWP(2*16-0x80,$K256));
&paddd ($Wi,@MSG[2]);
&pshufb (@MSG[3],$TMP);
&sha256rnds2 ($CDGH,$ABEF); # 8-11
&pshufd ($Wi,$Wi,0x0e);
&movdqa ($TMP,@MSG[3]);
&palignr ($TMP,@MSG[2],4);
&nop ();
&paddd (@MSG[0],$TMP);
&sha256msg1 (@MSG[1],@MSG[2]);
&sha256rnds2 ($ABEF,$CDGH);
&movdqa ($Wi,&QWP(3*16-0x80,$K256));
&paddd ($Wi,@MSG[3]);
&sha256msg2 (@MSG[0],@MSG[3]);
&sha256rnds2 ($CDGH,$ABEF); # 12-15
&pshufd ($Wi,$Wi,0x0e);
&movdqa ($TMP,@MSG[0]);
&palignr ($TMP,@MSG[3],4);
&nop ();
&paddd (@MSG[1],$TMP);
&sha256msg1 (@MSG[2],@MSG[3]);
&sha256rnds2 ($ABEF,$CDGH);
for($i=4;$i<16-3;$i++) {
&movdqa ($Wi,&QWP($i*16-0x80,$K256));
&paddd ($Wi,@MSG[0]);
&sha256msg2 (@MSG[1],@MSG[0]);
&sha256rnds2 ($CDGH,$ABEF); # 16-19...
&pshufd ($Wi,$Wi,0x0e);
&movdqa ($TMP,@MSG[1]);
&palignr ($TMP,@MSG[0],4);
&nop ();
&paddd (@MSG[2],$TMP);
&sha256msg1 (@MSG[3],@MSG[0]);
&sha256rnds2 ($ABEF,$CDGH);
push(@MSG,shift(@MSG));
}
# Rounds 52-55, which follow the generated loop above.  $K256 was advanced
# by 0x80 earlier in this block, hence the -0x80 bias on the table offset.
# Every call is terminated with ';'.  Without one after the second movdqa,
# Perl parses it and the following palignr as a single expression joined
# by a bitwise '&'.
&movdqa ($Wi,&QWP(13*16-0x80,$K256));
&paddd ($Wi,@MSG[0]);
&sha256msg2 (@MSG[1],@MSG[0]);
&sha256rnds2 ($CDGH,$ABEF); # 52-55
&pshufd ($Wi,$Wi,0x0e);
&movdqa ($TMP,@MSG[1]);
&palignr ($TMP,@MSG[0],4);
&sha256rnds2 ($ABEF,$CDGH);
&paddd (@MSG[2],$TMP);
&movdqa ($Wi,&QWP(14*16-0x80,$K256));
&paddd ($Wi,@MSG[1]);
&sha256rnds2 ($CDGH,$ABEF); # 56-59
&pshufd ($Wi,$Wi,0x0e);
&sha256msg2 (@MSG[2],@MSG[1]);
&movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask
&sha256rnds2 ($ABEF,$CDGH);
&movdqa ($Wi,&QWP(15*16-0x80,$K256));
&paddd ($Wi,@MSG[2]);
&nop ();
&sha256rnds2 ($CDGH,$ABEF); # 60-63
&pshufd ($Wi,$Wi,0x0e);
&cmp ($end,$inp);
&nop ();
&sha256rnds2 ($ABEF,$CDGH);
&paddd ($CDGH,&QWP(16,"esp"));
&paddd ($ABEF,&QWP(0,"esp"));
&jnz (&label("loop_shaext"));
&pshufd ($CDGH,$CDGH,0xb1); # DCHG
&pshufd ($TMP,$ABEF,0x1b); # FEBA
&pshufd ($ABEF,$ABEF,0xb1); # BAFE
&punpckhqdq ($ABEF,$CDGH); # DCBA
&palignr ($CDGH,$TMP,8); # HGFE
&mov ("esp",&DWP(32+12,"esp"));
&movdqu (&QWP(0,$ctx),$ABEF);
&movdqu (&QWP(16,$ctx),$CDGH);
&function_end_A();
}
my @X = map("xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
my @AH = ($A,$T);
@ -811,7 +961,6 @@ sub body_00_15 () {
if ($avx) {
&set_label("AVX",32);
if ($avx>1) {
&mov ("edx",&DWP(8,"edx"));
&and ("edx",1<<8|1<<3); # check for BMI2+BMI1
&cmp ("edx",1<<8|1<<3);
&je (&label("AVX_BMI"));

View File

@ -15,7 +15,7 @@
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
@ -29,7 +29,7 @@
# for n=4 is 20.3+4.44=24.7;
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 13% (on Haswell);
# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
$output = shift;
@ -103,7 +103,6 @@ $code.=<<___ if ($i<15);
punpckldq $t2,$Xi
punpckldq $t3,$t1
punpckldq $t1,$Xi
pshufb $Xn,$Xi
___
$code.=<<___ if ($i==15);
movd `4*$i`(@ptr[0]),$Xi
@ -117,11 +116,12 @@ $code.=<<___ if ($i==15);
punpckldq $t2,$Xi
punpckldq $t3,$t1
punpckldq $t1,$Xi
pshufb $Xn,$Xi
___
$code.=<<___;
movdqa $e,$sigma
`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
movdqa $e,$t3
`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
psrld \$6,$sigma
movdqa $e,$t2
pslld \$7,$t3
@ -136,7 +136,7 @@ $code.=<<___;
psrld \$25-11,$t2
movdqa $e,$t1
`"prefetch 63(@ptr[0])" if ($i==15)`
`"prefetcht0 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
@ -144,7 +144,7 @@ $code.=<<___;
pand $f,$axb
pxor $t2,$sigma
`"prefetch 63(@ptr[1])" if ($i==15)`
`"prefetcht0 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
@ -156,7 +156,7 @@ $code.=<<___;
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[2])" if ($i==15)`
`"prefetcht0 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
@ -164,7 +164,7 @@ $code.=<<___;
pand $axb,$bxc
pxor $sigma,$t2
`"prefetch 63(@ptr[3])" if ($i==15)`
`"prefetcht0 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
@ -232,9 +232,11 @@ $code.=<<___;
.type sha256_multi_block,\@function,3
.align 32
sha256_multi_block:
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
___
$code.=<<___ if ($avx);
mov OPENSSL_ia32cap_P+4(%rip),%rcx
test \$`1<<28`,%ecx
jnz _avx_shortcut
___
@ -260,6 +262,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
lea 0x80($ctx),$ctx # size optimization
@ -393,9 +396,363 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue:
ret
.size sha256_multi_block,.-sha256_multi_block
___
{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));
$code.=<<___;
.type sha256_multi_block_shaext,\@function,3
.align 32
sha256_multi_block_shaext:
_shaext_shortcut:
mov %rsp,%rax
push %rbx
push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,-0x78(%rax)
movaps %xmm11,-0x68(%rax)
movaps %xmm12,-0x58(%rax)
movaps %xmm13,-0x48(%rax)
movaps %xmm14,-0x38(%rax)
movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`,%rsp
shl \$1,$num # we process pair at a time
and \$-256,%rsp
lea 0x80($ctx),$ctx # size optimization
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_shaext:
lea `$REG_SZ*16`(%rsp),%rbx
lea K256_shaext+0x80(%rip),$Tbl
.Loop_grande_shaext:
mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
xor $num,$num
___
for($i=0;$i<2;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
___
}
$code.=<<___;
test $num,$num
jz .Ldone_shaext
movq 0x00-0x80($ctx),$ABEF0 # A1.A0
movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
movq 0x40-0x80($ctx),$CDGH0 # C1.C0
movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
movdqa $ABEF0,$ABEF1
movdqa $CDGH0,$CDGH1
punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
pshufd \$0b00011011,$ABEF0,$ABEF0
pshufd \$0b00011011,$CDGH0,$CDGH0
pshufd \$0b00011011,$ABEF1,$ABEF1
pshufd \$0b00011011,$CDGH1,$CDGH1
jmp .Loop_shaext
.align 32
.Loop_shaext:
movdqu 0x00(@ptr[0]),@MSG0[0]
movdqu 0x00(@ptr[1]),@MSG1[0]
movdqu 0x10(@ptr[0]),@MSG0[1]
movdqu 0x10(@ptr[1]),@MSG1[1]
movdqu 0x20(@ptr[0]),@MSG0[2]
pshufb $TMPx,@MSG0[0]
movdqu 0x20(@ptr[1]),@MSG1[2]
pshufb $TMPx,@MSG1[0]
movdqu 0x30(@ptr[0]),@MSG0[3]
lea 0x40(@ptr[0]),@ptr[0]
movdqu 0x30(@ptr[1]),@MSG1[3]
lea 0x40(@ptr[1]),@ptr[1]
movdqa 0*16-0x80($Tbl),$Wi
pshufb $TMPx,@MSG0[1]
paddd @MSG0[0],$Wi
pxor $ABEF0,@MSG0[0] # black magic
movdqa $Wi,$TMP0
movdqa 0*16-0x80($Tbl),$TMP1
pshufb $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
movdqa $CDGH0,0x50(%rsp) # offload
sha256rnds2 $ABEF0,$CDGH0 # 0-3
pxor $ABEF1,@MSG1[0] # black magic
movdqa $TMP1,$Wi
movdqa $CDGH1,0x70(%rsp)
sha256rnds2 $ABEF1,$CDGH1 # 0-3
pshufd \$0x0e,$TMP0,$Wi
pxor $ABEF0,@MSG0[0] # black magic
movdqa $ABEF0,0x40(%rsp) # offload
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
pxor $ABEF1,@MSG1[0] # black magic
movdqa $ABEF1,0x60(%rsp)
movdqa 1*16-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
pshufb $TMPx,@MSG0[2]
sha256rnds2 $CDGH1,$ABEF1
movdqa $TMP0,$Wi
movdqa 1*16-0x80($Tbl),$TMP1
paddd @MSG1[1],$TMP1
sha256rnds2 $ABEF0,$CDGH0 # 4-7
movdqa $TMP1,$Wi
prefetcht0 127(@ptr[0])
pshufb $TMPx,@MSG0[3]
pshufb $TMPx,@MSG1[2]
prefetcht0 127(@ptr[1])
sha256rnds2 $ABEF1,$CDGH1 # 4-7
pshufd \$0x0e,$TMP0,$Wi
pshufb $TMPx,@MSG1[3]
sha256msg1 @MSG0[1],@MSG0[0]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 2*16-0x80($Tbl),$TMP0
paddd @MSG0[2],$TMP0
sha256rnds2 $CDGH1,$ABEF1
movdqa $TMP0,$Wi
movdqa 2*16-0x80($Tbl),$TMP1
paddd @MSG1[2],$TMP1
sha256rnds2 $ABEF0,$CDGH0 # 8-11
sha256msg1 @MSG1[1],@MSG1[0]
movdqa $TMP1,$Wi
movdqa @MSG0[3],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 8-11
pshufd \$0x0e,$TMP0,$Wi
palignr \$4,@MSG0[2],$TMPx
paddd $TMPx,@MSG0[0]
movdqa @MSG1[3],$TMPx
palignr \$4,@MSG1[2],$TMPx
sha256msg1 @MSG0[2],@MSG0[1]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 3*16-0x80($Tbl),$TMP0
paddd @MSG0[3],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[2],@MSG1[1]
movdqa $TMP0,$Wi
movdqa 3*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[0]
paddd @MSG1[3],$TMP1
sha256msg2 @MSG0[3],@MSG0[0]
sha256rnds2 $ABEF0,$CDGH0 # 12-15
movdqa $TMP1,$Wi
movdqa @MSG0[0],$TMPx
palignr \$4,@MSG0[3],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 12-15
sha256msg2 @MSG1[3],@MSG1[0]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[1]
movdqa @MSG1[0],$TMPx
palignr \$4,@MSG1[3],$TMPx
sha256msg1 @MSG0[3],@MSG0[2]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 4*16-0x80($Tbl),$TMP0
paddd @MSG0[0],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $TMP0,$Wi
movdqa $i*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
sha256msg2 @MSG0[0],@MSG0[1]
sha256rnds2 $ABEF0,$CDGH0 # 16-19...
movdqa $TMP1,$Wi
movdqa @MSG0[1],$TMPx
palignr \$4,@MSG0[0],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 16-19...
sha256msg2 @MSG1[0],@MSG1[1]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[2]
movdqa @MSG1[1],$TMPx
palignr \$4,@MSG1[0],$TMPx
sha256msg1 @MSG0[0],@MSG0[3]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa `($i+1)*16`-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[0],@MSG1[3]
___
push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
}
$code.=<<___;
movdqa $TMP0,$Wi
movdqa 13*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
sha256msg2 @MSG0[0],@MSG0[1]
sha256rnds2 $ABEF0,$CDGH0 # 52-55
movdqa $TMP1,$Wi
movdqa @MSG0[1],$TMPx
palignr \$4,@MSG0[0],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 52-55
sha256msg2 @MSG1[0],@MSG1[1]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[2]
movdqa @MSG1[1],$TMPx
palignr \$4,@MSG1[0],$TMPx
nop
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 14*16-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
sha256rnds2 $CDGH1,$ABEF1
movdqa $TMP0,$Wi
movdqa 14*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[2]
paddd @MSG1[1],$TMP1
sha256msg2 @MSG0[1],@MSG0[2]
nop
sha256rnds2 $ABEF0,$CDGH0 # 56-59
movdqa $TMP1,$Wi
mov \$1,%ecx
pxor @MSG0[1],@MSG0[1] # zero
sha256rnds2 $ABEF1,$CDGH1 # 56-59
sha256msg2 @MSG1[1],@MSG1[2]
pshufd \$0x0e,$TMP0,$Wi
movdqa 15*16-0x80($Tbl),$TMP0
paddd @MSG0[2],$TMP0
movq (%rbx),@MSG0[2] # pull counters
nop
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 15*16-0x80($Tbl),$TMP1
paddd @MSG1[2],$TMP1
sha256rnds2 $CDGH1,$ABEF1
movdqa $TMP0,$Wi
cmp 4*0(%rbx),%ecx # examine counters
cmovge %rsp,@ptr[0] # cancel input
cmp 4*1(%rbx),%ecx
cmovge %rsp,@ptr[1]
pshufd \$0x00,@MSG0[2],@MSG1[0]
sha256rnds2 $ABEF0,$CDGH0 # 60-63
movdqa $TMP1,$Wi
pshufd \$0x55,@MSG0[2],@MSG1[1]
movdqa @MSG0[2],@MSG1[2]
sha256rnds2 $ABEF1,$CDGH1 # 60-63
pshufd \$0x0e,$TMP0,$Wi
pcmpgtd @MSG0[1],@MSG1[0]
pcmpgtd @MSG0[1],@MSG1[1]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
pcmpgtd @MSG0[1],@MSG1[2] # counter mask
movdqa K256_shaext-0x10(%rip),$TMPx
sha256rnds2 $CDGH1,$ABEF1
pand @MSG1[0],$CDGH0
pand @MSG1[1],$CDGH1
pand @MSG1[0],$ABEF0
pand @MSG1[1],$ABEF1
paddd @MSG0[2],@MSG1[2] # counters--
paddd 0x50(%rsp),$CDGH0
paddd 0x70(%rsp),$CDGH1
paddd 0x40(%rsp),$ABEF0
paddd 0x60(%rsp),$ABEF1
movq @MSG1[2],(%rbx) # save counters
dec $num
jnz .Loop_shaext
mov `$REG_SZ*17+8`(%rsp),$num
pshufd \$0b00011011,$ABEF0,$ABEF0
pshufd \$0b00011011,$CDGH0,$CDGH0
pshufd \$0b00011011,$ABEF1,$ABEF1
pshufd \$0b00011011,$CDGH1,$CDGH1
movdqa $ABEF0,@MSG0[0]
movdqa $CDGH0,@MSG0[1]
punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
movq $ABEF0,0x00-0x80($ctx) # A1.A0
psrldq \$8,$ABEF0
movq @MSG0[0],0x80-0x80($ctx) # E1.E0
psrldq \$8,@MSG0[0]
movq $ABEF0,0x20-0x80($ctx) # B1.B0
movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
movq $CDGH0,0x40-0x80($ctx) # C1.C0
psrldq \$8,$CDGH0
movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
psrldq \$8,@MSG0[1]
movq $CDGH0,0x60-0x80($ctx) # D1.D0
movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
lea `$REG_SZ/2`($ctx),$ctx
lea `16*2`($inp),$inp
dec $num
jnz .Loop_grande_shaext
.Ldone_shaext:
#mov `$REG_SZ*17`(%rsp),%rax # original %rsp
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
movaps -0xa8(%rax),%xmm7
movaps -0x98(%rax),%xmm8
movaps -0x88(%rax),%xmm9
movaps -0x78(%rax),%xmm10
movaps -0x68(%rax),%xmm11
movaps -0x58(%rax),%xmm12
movaps -0x48(%rax),%xmm13
movaps -0x38(%rax),%xmm14
movaps -0x28(%rax),%xmm15
___
$code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_shaext:
ret
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
}}}
if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
@ -470,38 +827,38 @@ $code.=<<___;
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[0])" if ($i==15)`
`"prefetcht0 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
`"prefetch 63(@ptr[1])" if ($i==15)`
`"prefetcht0 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
`"prefetch 63(@ptr[2])" if ($i==15)`
`"prefetcht0 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[3])" if ($i==15)`
`"prefetcht0 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
`"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
`"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
`"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
@ -586,6 +943,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
lea 0x80($ctx),$ctx # size optimization
@ -718,6 +1076,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx:
ret
.size sha256_multi_block_avx,.-sha256_multi_block_avx
___
@ -760,6 +1119,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx2:
lea K256+128(%rip),$Tbl
lea 0x80($ctx),$ctx # size optimization
@ -896,6 +1256,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx2:
ret
.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
@ -932,17 +1293,263 @@ $code.=<<___;
.Lpbswap:
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
K256_shaext:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov `16*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
lea -24-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
# Win64 SEH handler for the AVX2 code path, emitted only when $avx>1.
#
# HandlerData[0] and HandlerData[1] are the RVAs of the body and epilogue
# labels.  If the faulting Rip lies between them, the handler recovers the
# caller's stack pointer that the function body stored at 32*17 bytes above
# its own aligned %rsp, reloads the non-volatile GPRs and the XMM save area
# from below that pointer, and then joins the common tail at .Lin_prologue.
#
# The saved stack pointer lives in the function's stack frame, so it is
# read relative to %rax (which holds context->Rsp at that point), as
# se_handler does, and not relative to the CONTEXT record.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
	mov	`32*17`(%rax),%rax	# pull saved stack pointer
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15
	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha256_multi_block
.rva .LSEH_end_sha256_multi_block
.rva .LSEH_info_sha256_multi_block
.rva .LSEH_begin_sha256_multi_block_shaext
.rva .LSEH_end_sha256_multi_block_shaext
.rva .LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_sha256_multi_block_avx
.rva .LSEH_end_sha256_multi_block_avx
.rva .LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_sha256_multi_block_avx2
.rva .LSEH_end_sha256_multi_block_avx2
.rva .LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha256_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
.byte 9,0,0,0
.rva avx2_handler
.rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
####################################################################
# Prepend a REX prefix byte to an opcode byte list when either register
# operand needs one.
#   $bytes - reference to the array of opcode bytes (modified in place)
#   $dst   - number of the register encoded in ModR/M.reg
#   $src   - number of the register encoded in ModR/M.r/m
# Registers 8 and above set REX.R (0x04) for $dst and REX.B (0x01) for
# $src; nothing is prepended when both are below 8.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = 0;
    $prefix |= 0x04 if ($dst >= 8);
    $prefix |= 0x01 if ($src >= 8);
    unshift @{$bytes}, $prefix | 0x40 if ($prefix);
}
# Encode a two-register SHA256 instruction from the 0F 38 opcode map
# (sha256rnds2, sha256msg1, sha256msg2) as a ".byte" sequence.
#   first argument  - instruction mnemonic
#   second argument - operand string, expected as "%xmmS,%xmmD"
# Unknown mnemonics and operand strings of any other shape are returned as
# "mnemonic<TAB>operands".
sub sha256op38 {
    my ($mnemonic, $operands) = @_;
    my %opcode_byte = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd );

    if (defined($opcode_byte{$mnemonic}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src, $dst) = ($1, $2);
        my @bytes = (0x0f, 0x38);
        rex(\@bytes, $dst, $src);
        push @bytes, $opcode_byte{$mnemonic};
        push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @bytes);
    }
    return $mnemonic . "\t" . $operands;
}
# Final pass: walk the generated assembly in $code one line at a time,
# rewrite the line in $_ and print it followed by a newline.
foreach (split("\n",$code)) {
# Evaluate every `...` span as a Perl expression and splice in its value.
s/\`([^\`]*)\`/eval($1)/ge;
# The substitutions below are chained with "or", so at most one of them is
# applied to a line: the first one that matches.
# SHA256 mnemonics are routed through sha256op38(), which returns either a
# ".byte" encoding or the mnemonic line.
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
# The remaining rules rewrite %ymmN operands to %xmmN for the listed
# mnemonics; the vmovdqu rule only fires on the literal "%x%ymmN" spelling.
s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
# vinserti128: insert a "$1," immediate and turn the first %ymm into %xmm.
s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
print $_,"\n";
}

View File

@ -67,7 +67,12 @@
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512.
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
@ -254,6 +259,10 @@ $code.=<<___ if ($SZ==4 || $avx);
mov 4(%r11),%r10d
mov 8(%r11),%r11d
___
# SHA-256 flavour only ($SZ==4): test bit 29 of the capability word just
# loaded into %r11d and, when it is set, branch to _shaext_shortcut, the
# entry label of the SHA Extensions code path.
$code.=<<___ if ($SZ==4);
test \$`1<<29`,%r11d # check for SHA
jnz _shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
test \$`1<<11`,%r10d # check for XOP
jnz .Lxop_shortcut
@ -509,6 +518,166 @@ ___
######################################################################
# SIMD code paths
#
if ($SZ==4) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
# Generated only for the SHA-256 flavour of this script ($SZ==4).  The
# routine is emitted as sha256_block_data_order_shaext and is also
# reachable through the _shaext_shortcut label.
#
# Register roles used by the templates below:
#   $ctx  (%rdi)  state; read and written as two 16-byte halves
#   $inp  (%rsi)  input pointer; advanced by 0x40 per loop iteration
#   $num  (%rdx)  loop counter; decremented once per loop iteration
#   $Tbl  (%rcx)  K256+0x80; every table reference subtracts the 0x80
#                 bias again, round constants are read at a 32-byte
#                 stride and the byte swap mask sits at K256+0x200
#   $Wi   (%xmm0) table entry plus message words, prepared before each
#                 pair of sha256rnds2 instructions
#                 NOTE(review): sha256rnds2 takes %xmm0 as an implicit
#                 operand per the Intel SDM - confirm
#   $ABEF,$CDGH            (%xmm1,%xmm2)  working state
#   $TMP                   (%xmm7)        byte swap mask / scratch
#   $BSWAP                 (%xmm8)        spare copy of the byte swap mask
#   $ABEF_SAVE,$CDGH_SAVE  (%xmm9,%xmm10) state at the top of the loop,
#                                         added back at the bottom
#   @MSG                   (%xmm3-%xmm6)  message schedule registers
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
# Function header and the two entry labels.
$code.=<<___;
.type sha256_block_data_order_shaext,\@function,3
.align 64
sha256_block_data_order_shaext:
_shaext_shortcut:
___
# Win64 only: reserve 8+5*16 bytes of stack and save %xmm6-%xmm10 there.
# The save slots are addressed relative to %rax, which is not set anywhere
# in this block - presumably it holds the stack pointer from function
# entry; verify against the prologue emitted elsewhere.
$code.=<<___ if ($win64);
lea `-8-5*16`(%rsp),%rsp
movaps %xmm6,-8-5*16(%rax)
movaps %xmm7,-8-4*16(%rax)
movaps %xmm8,-8-3*16(%rax)
movaps %xmm9,-8-2*16(%rax)
movaps %xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the state and shuffle it into the ABEF/CDGH layout, then open the
# per-block loop.  This first part of the loop body loads the 64-byte input
# block, byte-swaps it with the mask in $TMP and covers rounds 0-15.
# $TMP is reused as scratch from round 8 on, which is why the mask was
# copied to $BSWAP beforehand.
$code.=<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
nop
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
lea 0x40($inp),$inp
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
nop
paddd $TMP,@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
nop
paddd $TMP,@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $CDGH,$ABEF
___
# Rounds 16-51: nine copies ($i = 4..12) of one four-round template, each
# reading the table entry at $i*32.  @MSG is rotated by one position after
# every copy so that the same template text names the next set of message
# registers.  Note that $i is not declared with "my" here.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $i*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
nop
paddd $TMP,@MSG[2]
sha256msg1 @MSG[0],@MSG[3]
sha256rnds2 $CDGH,$ABEF
___
# rotate the message registers for the next copy of the template
push(@MSG,shift(@MSG));
}
# Rounds 52-63 (table entries 13..15).  The byte swap mask is restored into
# $TMP from $BSWAP for the next iteration, the saved state is added back and
# the loop repeats while the decremented $num is non-zero.  After the loop
# the state is shuffled back into its original layout and stored to $ctx.
# NOTE(review): the jnz consumes the flags of "dec $num" issued several
# instructions earlier, so it relies on the intervening SSE/SHA instructions
# leaving ZF untouched - confirm.
$code.=<<___;
movdqa 13*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
paddd $TMP,@MSG[2]
movdqa 14*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
nop
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
dec $num
nop
sha256rnds2 $CDGH,$ABEF
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
# Win64 only: reload %xmm6-%xmm10 from the slots written in the prologue and
# restore the stack pointer from %rax.
$code.=<<___ if ($win64);
movaps -8-5*16(%rax),%xmm6
movaps -8-4*16(%rax),%xmm7
movaps -8-3*16(%rax),%xmm8
movaps -8-2*16(%rax),%xmm9
movaps -8-1*16(%rax),%xmm10
mov %rax,%rsp
.Lepilogue_shaext:
___
# Return and record the function size.
$code.=<<___;
ret
.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
{{{
my $a4=$T1;
@ -620,13 +789,13 @@ $code.=<<___;
movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2]
movdqu 0x30($inp),@X[3]
pshufb $t3,@X[0]
movdqu 0x30($inp),@X[3]
lea $TABLE(%rip),$Tbl
pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0
pshufb $t3,@X[2]
movdqa 0x20($Tbl),$t1
pshufb $t3,@X[2]
paddd @X[0],$t0
movdqa 0x40($Tbl),$t2
pshufb $t3,@X[3]
@ -2087,6 +2256,39 @@ $code.=<<___;
ret
.size se_handler,.-se_handler
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
lea -8-5*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$10,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size shaext_handler,.-shaext_handler
.section .pdata
.align 4
.rva .LSEH_begin_$func
@ -2094,6 +2296,9 @@ $code.=<<___;
.rva .LSEH_info_$func
___
$code.=<<___ if ($SZ==4);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
.rva .LSEH_begin_${func}_ssse3
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3
@ -2122,6 +2327,9 @@ $code.=<<___;
.rva .Lprologue,.Lepilogue # HandlerData[]
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_shaext:
.byte 9,0,0,0
.rva shaext_handler
.LSEH_info_${func}_ssse3:
.byte 9,0,0,0
.rva se_handler
@ -2147,6 +2355,28 @@ $code.=<<___ if ($avx>1);
___
}
# Replace every `...` span in the accumulated code with the result of
# evaluating its contents, then print the whole buffer in one go.
# NOTE(review): the foreach loop further down performs the same expansion
# and prints $code again, line by line.  If both are kept the output is
# emitted twice; this pair looks like the pre-change version of that loop -
# confirm which one is meant to stay.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Translate one SHA-256 extension instruction into raw ".byte" form.
#
# Arguments: the mnemonic, then the operand text that followed it.
#
# When the mnemonic is one of sha256rnds2/sha256msg1/sha256msg2 and the
# operand text contains a "%xmmA, %xmmB" pair with both registers in the
# range 0-7, the result is a ".byte" directive holding 0x0f,0x38, the
# opcode byte and a register-direct ModR/M byte (A in the low three bits,
# B in bits 3-5).  No REX prefix is ever produced here, so anything else -
# unknown mnemonics, memory operands, registers %xmm8 and above - is
# returned unchanged as "mnemonic<TAB>operands".
sub sha256op38 {
    my ($instr, $operands) = @_;
    my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd,
    );
    my $opcode = $opcodelet{$instr};

    # The trailing \b keeps a two-digit register such as %xmm12 from being
    # matched as %xmm1 and silently encoded as the wrong register.
    if (defined($opcode) && $operands =~ /%xmm([0-7]),\s*%xmm([0-7])\b/) {
        my ($first, $second) = ($1, $2);
        my @bytes = (0x0f, 0x38, $opcode);
        push @bytes, 0xc0|($first&7)|(($second&7)<<3);	# ModR/M
        return ".byte\t".join(',', @bytes);
    }

    return $instr."\t".$operands;
}
# Final pass: walk the generated code one line at a time, expand `...`
# expressions, hand sha256* instructions to sha256op38() and print the line.
for (split(/\n/,$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/ge;

    print $_,"\n";
}

# Errors from buffered writes only show up when the handle is closed, so an
# unchecked close can leave a truncated output file behind without any sign
# of failure.
close STDOUT or die "error closing STDOUT: $!";