crypto/sha/asm/sha1-x86_64.pl update:

+5% on Atom Silvermont, up to +8% improvement of legacy code.
Harmonize sha1-586.pl and aesni-sha1-x86_64.pl with sha1-x86_64.pl.
This commit is contained in:
Andy Polyakov 2014-02-01 21:07:16 +01:00
parent 30ea570f0f
commit b217ca63b1
3 changed files with 260 additions and 156 deletions

View File

@ -21,24 +21,24 @@
# subroutine:
#
# AES-128-CBC +SHA1 stitch gain
# Westmere 3.77[+5.5] 9.26 6.66 +39%
# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21) 5.98(7.01) +68%(+60%)
# Westmere 3.77[+5.3] 9.07 6.55 +38%
# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
# Haswell 4.43[+3.6(4.1)] 8.00(8.55) 4.55(5.21) +75%(+64%)
# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
#
# AES-192-CBC
# Westmere 4.51 10.00 6.91 +45%
# Sandy Bridge 6.05 11.06(12.21) 6.11(7.18) +81%(+70%)
# Westmere 4.51 9.81 6.80 +44%
# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
# Ivy Bridge 6.05 10.65 6.07 +75%
# Haswell 5.29 8.86(9.42) 5.32(5.32) +67%(+77%)
# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
# Bulldozer 6.89 12.84 6.96 +84%
#
# AES-256-CBC
# Westmere 5.25 10.74 7.24 +48%
# Sandy Bridge 7.05 12.06(13.21) 7.12(7.63) +69%(+73%)
# Westmere 5.25 10.55 7.21 +46%
# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
# Ivy Bridge 7.05 11.65 7.12 +64%
# Haswell 6.19 9.76(10.3) 6.21(6.25) +57%(+65%)
# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
# Bulldozer 8.00 13.95 8.25 +69%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
@ -230,11 +230,11 @@ $code.=<<___;
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @Tx[2],@X[-4&7] # byte swap
add \$64,$inp
pshufb @Tx[2],@X[-3&7]
pshufb @Tx[2],@X[-2&7]
pshufb @Tx[2],@X[-1&7]
add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
pshufb @Tx[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@ -297,74 +297,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa(@X[0],@X[-3&7]);
eval(shift(@insns));
eval(shift(@insns)); # ror
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@Tx[1],@X[-1&7]);
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns)); # ror
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
&pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@ -375,27 +376,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&pshufd (@Tx[0],@X[-2&7],0xee) if ($Xi==8); # was &movdqa (@Tx[0],@X[-1&7])
eval(shift(@insns)); # body_20_39
eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns)) if ($Xi==8);
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns)) if (@insns[1] =~ /_ror/);
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
}
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@ -403,28 +407,30 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns)); # body_20_39
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
@ -445,11 +451,12 @@ sub Xuplast_ssse3_80()
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@ -481,9 +488,12 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@ -492,6 +502,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }

View File

@ -93,8 +93,9 @@
# Westmere 7.3 5.5/+33% -
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
# Haswell 6.5 4.3/+51% 4.1(**)/+58%
# Bulldozer 11.6 6.0/+92%
# VIA Nano 10.6 7.4/+43%
# VIA Nano 10.6 7.5/+41%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# It remains a mystery [to me] why ILP is limited to 1.7.
@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) };
&mov (@T[1],$C);
&psubd (@X[-2&7],@X[3]);
&xor (@T[1],$D);
&movdqa (@X[0],@X[-3&7]);
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
&and (@T[0],@T[1]);
&jmp (&label("loop"));
@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns));
&palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
&movdqa (@X[2],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@X[2],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@X[4],@X[0]);
&movdqa (@X[2],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&movdqa (@X[2],@X[0]);
eval(shift(@insns));
&pslldq (@X[4],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrld (@X[2],31);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&movdqa (@X[3],@X[4]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrld (@X[4],30);
&por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns)); # ror
&por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
&movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
&pslld (@X[3],2);
&pxor (@X[0],@X[4]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[4]);
&movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
&movdqa (@X[1],@X[-2&7]) if ($Xi<7);
&pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7])
&pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7);
eval(shift(@insns));
eval(shift(@insns));
@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79()
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&movdqa (@X[2],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
&palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
&punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79()
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
if ($Xi%5) {
&movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
}
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns)); # ror
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&por (@X[0],@X[2]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&movdqa (@X[3],@X[0]) if ($Xi<19);
&pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0])
eval(shift(@insns));
foreach (@insns) { eval; } # remaining instructions
@ -691,6 +696,12 @@ sub Xuplast_ssse3_80()
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
@ -728,9 +739,16 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@X[3]);
eval(shift(@insns));
eval(shift(@insns));
@ -739,6 +757,8 @@ sub Xloop_ssse3()
&movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@X[3]);
foreach (@insns) { eval; }
@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
######
# bodyx_00_19: build the code fragments for one round of the rorx/andn
# flavour of the integer round body.
#
# Returns a list of strings. Each string holds Perl code that, once
# eval'ed, issues one or more instruction-generating calls (&rorx, &lea,
# &andn, ...). The strings are single-quoted, so $a..$e, @T and $j are
# resolved only at eval time, not when this sub runs.
# NOTE(review): presumably consumed the same way as the body_* fragments,
# i.e. by schedulers that do eval(shift(@insns)) -- confirm against callers.
#
# $rx counts the fragments sets handed out so far: once it has reached 19
# this sub delegates to bodyx_20_39 instead of producing its own list.
sub bodyx_00_19 () { # ((c^d)&b)^d
# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
return &bodyx_20_39() if ($rx==19); $rx++;
(
# The first three literals are joined with '.', so unpacking @V and the
# $j-dependent choice of rorx form a single fragment.
'($a,$b,$c,$d,$e)=@V;'.
'&rorx ($b,$b,2) if ($j==0);'. # $b>>>2
'&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2
'&lea ($e,&DWP(0,$e,@T[0]));',
'&rorx (@T[0],$a,5);',
'&andn (@T[1],$a,$c);',
'&and ($a,$b)',
'&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer
'&xor (@T[1],$a)',
# The last fragment also advances $j and rotates @V and @T by one
# position (last element moved to the front) for the next round.
'&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
# bodyx_20_39: build the code fragments for one round of the rorx flavour
# of the integer round body that uses the b^d^c function.
#
# Returns a list of strings of Perl code to be eval'ed one at a time; the
# single quotes defer resolution of $a..$e, @T and $j until eval time.
# NOTE(review): presumably consumed by schedulers that interleave
# eval(shift(@insns)) with SIMD code -- confirm against callers.
#
# $rx counts the fragment sets handed out so far: once it has reached 39
# this sub delegates to bodyx_40_59 instead of producing its own list.
sub bodyx_20_39 () { # b^d^c
# on start $b=b^c^d
return &bodyx_40_59() if ($rx==39); $rx++;
(
# Unpacking @V and the first add are joined with '.' into one fragment.
# When $j is 19 the value added to $e is taken from @T[0] rather than $b.
'($a,$b,$c,$d,$e)=@V;'.
'&add ($e,($j==19?@T[0]:$b))',
'&rorx ($b,@T[1],7);', # $b>>>2
'&rorx (@T[0],$a,5);',
# The ($j<79) guards drop the instructions that only prepare the following
# round once $j has reached 79.
'&xor ($a,$b) if ($j<79);',
'&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer
'&xor ($a,$c) if ($j<79);',
# The last fragment also advances $j and rotates @V and @T by one
# position (last element moved to the front) for the next round.
'&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
# bodyx_40_59: build the code fragments for one round of the rorx flavour
# of the integer round body that uses the ((b^c)&(c^d))^c function.
#
# Returns a list of strings of Perl code to be eval'ed one at a time; the
# single quotes defer resolution of $a..$e, @T and $j until eval time.
# NOTE(review): presumably consumed by schedulers that interleave
# eval(shift(@insns)) with SIMD code -- confirm against callers.
#
# $rx counts the fragment sets handed out so far: once it has reached 59
# this sub delegates to bodyx_20_39 instead of producing its own list.
sub bodyx_40_59 () { # ((b^c)&(c^d))^c
# on start $b=((b^c)&(c^d))^c
return &bodyx_20_39() if ($rx==59); $rx++;
(
# Unpacking @V and the first rorx are joined with '.' into one fragment.
'($a,$b,$c,$d,$e)=@V;'.
'&rorx (@T[0],$a,5)',
'&lea ($e,&DWP(0,$e,$b))',
'&rorx ($b,@T[1],7)', # $b>>>2
'&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer
'&mov (@T[1],$c)',
'&xor ($a,$b)', # b^c for next round
'&xor (@T[1],$b)', # c^d for next round
'&and ($a,@T[1])',
'&add ($e,@T[0])',
# The ';' after the xor is required: the two literals are concatenated
# into one string, and without a separator it reads '&xor ($a,$b)$j++',
# which does not parse, so an eval of this fragment would emit nothing and
# skip the $j increment and the @V/@T rotation.
'&xor ($a,$b);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
&set_label("loop",16);
&Xupdate_ssse3_16_31(\&body_00_19);
@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
&mov (&DWP(12,@T[1]),$D);
&xor ($B,$D);
&mov (&DWP(16,@T[1]),$E);
&and ($B,@T[0]);
&movdqa (@X[0],@X[-3&7]);
&xchg ($B,@T[0]);
&mov (@T[1],@T[0]);
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
&and (@T[0],$B);
&mov ($B,$T[1]);
&jmp (&label("loop"));
@ -1226,9 +1305,10 @@ sub Xtail_avx()
&mov (&DWP(8,@T[1]),$C);
&xor ($B,$D);
&mov (&DWP(12,@T[1]),$D);
&and ($B,@T[0]);
&mov (&DWP(16,@T[1]),$E);
&xchg ($B,@T[0]);
&mov (@T[1],@T[0]);
&and (@T[0],$B);
&mov ($B,@T[1]);
&jmp (&label("loop"));

View File

@ -62,16 +62,20 @@
# CPU clock cycles spent to process single byte (less is better).
#
# x86_64 SSSE3 AVX[2]
# P4 9.8 -
# Opteron 6.65 -
# Core2 6.70 6.05/+11% -
# Westmere 7.08 5.44/+30% -
# Sandy Bridge 7.93 6.16/+28% 4.99/+59%
# Ivy Bridge 6.30 4.63/+36% 4.60/+37%
# Haswell 5.98 4.12/+45% 3.57/+67%
# Bulldozer 10.9 5.95/+82%
# VIA Nano 10.2 7.46/+37%
# Atom 11.0 9.61/+14%
# P4 9.05 -
# Opteron 6.26 -
# Core2 6.55 6.05/+8% -
# Westmere 6.73 5.30/+27% -
# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
# Haswell 5.45 4.15/+31% 3.57/+53%
# Bulldozer 9.11 5.95/+53%
# VIA Nano 9.32 7.15/+30%
# Atom [10.5?] [9.23?]/+14%
# Silvermont 13.1(*) 9.37/+40%
#
# (*) obviously suboptimal result, nothing was done about it,
# because SSSE3 code is compiled unconditionally;
$flavour = shift;
$output = shift;
@ -114,7 +118,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
@ -129,42 +133,40 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi[0]
bswap $xi[0]
mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
mov $c,$t0
mov `4*$j`($inp),$xi[1]
mov $d,$t0
mov $xi[0],`4*$i`(%rsp)
mov $a,$t2
xor $d,$t0
bswap $xi[1]
xor $c,$t0
rol \$5,$t2
lea 0x5a827999($xi[0],$e),$e
and $b,$t0
mov $xi[1],`4*$j`(%rsp)
lea 0x5a827999($xi[0],$e),$e
add $t2,$e
xor $d,$t0
rol \$30,$b
add $t0,$e
___
$code.=<<___ if ($i>=15);
mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
xor `4*($j%16)`(%rsp),$xi[1]
mov $d,$t0
mov $xi[0],`4*($i%16)`(%rsp)
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $d,$t0
xor $c,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
lea 0x5a827999($xi[0],$e),$e
xor `4*(($j+13)%16)`(%rsp),$xi[1]
xor $d,$t0
rol \$1,$xi[1]
add $t2,$e
rol \$30,$b
mov $xi[1],`4*($j%16)`(%rsp)
xor $d,$t0
add $t2,$e
rol \$1,$xi[1]
add $t0,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}
sub BODY_20_39 {
@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
xor `4*($j%16)`(%rsp),$xi[1]
mov $b,$t0
`"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $b,$t0
rol \$5,$t2
lea $K($xi[0],$e),$e
xor `4*(($j+8)%16)`(%rsp),$xi[1]
xor $d,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
lea $K($xi[0],$e),$e
xor $c,$t0
add $t2,$e
xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
add $t0,$e
rol \$1,$xi[1]
___
$code.=<<___ if ($i<76);
mov $xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
mov $c,$t0
mov $b,$t0
mov $a,$t2
xor $b,$t0
xor $d,$t0
lea $K($xi[0],$e),$e
rol \$5,$t2
xor $d,$t0
xor $c,$t0
add $t2,$e
rol \$30,$b
add $t0,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
mov $c,$t1
xor `4*($j%16)`(%rsp),$xi[1]
mov $d,$t0
mov $xi[0],`4*($i%16)`(%rsp)
mov $d,$t1
xor `4*(($j+2)%16)`(%rsp),$xi[1]
and $d,$t0
and $c,$t0
mov $a,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
xor $d,$t1
lea 0x8f1bbcdc($xi[0],$e),$e
xor $c,$t1
rol \$5,$t2
xor `4*(($j+13)%16)`(%rsp),$xi[1]
add $t0,$e
and $b,$t1
rol \$1,$xi[1]
add $t1,$e
rol \$30,$b
mov $xi[1],`4*($j%16)`(%rsp)
and $b,$t1
add $t2,$e
rol \$30,$b
add $t1,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}
$code.=<<___;
@ -261,17 +259,18 @@ $code.=<<___;
.align 16
.Lialu:
mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
mov %rsp,%r11
push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
mov %r11,`16*4`(%rsp)
mov %rax,`16*4`(%rsp)
.Lprologue:
mov 0($ctx),$A
@ -305,11 +304,12 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
mov (%rsi),%r13
mov 8(%rsi),%r12
mov 16(%rsi),%rbp
mov 24(%rsi),%rbx
lea 32(%rsi),%rsp
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue:
ret
.size sha1_block_data_order,.-sha1_block_data_order
@ -389,11 +389,11 @@ $code.=<<___;
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @X[2],@X[-4&7] # byte swap
add \$64,$inp
pshufb @X[2],@X[-3&7]
pshufb @X[2],@X[-2&7]
pshufb @X[2],@X[-1&7]
add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
pshufb @X[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@ -418,74 +418,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
&movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
eval(shift(@insns)); # ror
&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
&palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@Tx[1],@X[-1&7]);
&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
&movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns)); # ror
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
&pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79()
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
&palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
eval(shift(@insns)) if ($Xi==8);
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns)) if (@insns[1] =~ /_ror/);
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
}
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
&movdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns)); # body_20_39
eval(shift(@insns)) if (@insns[1] =~ /_rol/);
eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@ -566,11 +572,12 @@ sub Xuplast_ssse3_80()
my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@ -602,10 +609,12 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@ -614,6 +623,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@ -1680,16 +1691,17 @@ se_handler:
jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler