mirror of
https://github.com/openssl/openssl.git
synced 2024-12-20 07:23:54 +08:00
poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.
Effectively it's a minor size optimization, 5-6% per affected subroutine. Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
parent
c1e1fc500d
commit
73e8a5c826
@ -1631,8 +1631,9 @@ $code.=<<___ if ($win64);
|
||||
.Ldo_avx2_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea 48+64($ctx),$ctx # size optimization
|
||||
lea .Lconst(%rip),%rcx
|
||||
lea 48+64($ctx),$ctx # size optimization
|
||||
vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
|
||||
|
||||
# expand and copy pre-calculated table to stack
|
||||
vmovdqu `16*0-64`($ctx),%x#$T2
|
||||
@ -1642,36 +1643,28 @@ $code.=<<___;
|
||||
vmovdqu `16*3-64`($ctx),%x#$D0
|
||||
vmovdqu `16*4-64`($ctx),%x#$D1
|
||||
vmovdqu `16*5-64`($ctx),%x#$D2
|
||||
lea 0x90(%rsp),%rax # size optimization
|
||||
vmovdqu `16*6-64`($ctx),%x#$D3
|
||||
vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
|
||||
vpermd $T2,$T0,$T2 # 00003412 -> 14243444
|
||||
vmovdqu `16*7-64`($ctx),%x#$D4
|
||||
vpermq \$0x15,$T3,$T3
|
||||
vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
|
||||
vpermd $T3,$T0,$T3
|
||||
vmovdqu `16*8-64`($ctx),%x#$MASK
|
||||
vpermq \$0x15,$T4,$T4
|
||||
vpshufd \$0xc8,$T3,$T3
|
||||
vpermd $T4,$T0,$T4
|
||||
vmovdqa $T2,0x00(%rsp)
|
||||
vpermq \$0x15,$D0,$D0
|
||||
vpshufd \$0xc8,$T4,$T4
|
||||
vmovdqa $T3,0x20(%rsp)
|
||||
vpermq \$0x15,$D1,$D1
|
||||
vpshufd \$0xc8,$D0,$D0
|
||||
vmovdqa $T4,0x40(%rsp)
|
||||
vpermq \$0x15,$D2,$D2
|
||||
vpshufd \$0xc8,$D1,$D1
|
||||
vmovdqa $D0,0x60(%rsp)
|
||||
vpermq \$0x15,$D3,$D3
|
||||
vpshufd \$0xc8,$D2,$D2
|
||||
vmovdqa $D1,0x80(%rsp)
|
||||
vpermq \$0x15,$D4,$D4
|
||||
vpshufd \$0xc8,$D3,$D3
|
||||
vmovdqa $D2,0xa0(%rsp)
|
||||
vpermq \$0x15,$MASK,$MASK
|
||||
vpshufd \$0xc8,$D4,$D4
|
||||
vmovdqa $D3,0xc0(%rsp)
|
||||
vpshufd \$0xc8,$MASK,$MASK
|
||||
vmovdqa $D4,0xe0(%rsp)
|
||||
vmovdqa $MASK,0x100(%rsp)
|
||||
vpermd $D0,$T0,$D0
|
||||
vmovdqa $T3,0x20-0x90(%rax)
|
||||
vpermd $D1,$T0,$D1
|
||||
vmovdqa $T4,0x40-0x90(%rax)
|
||||
vpermd $D2,$T0,$D2
|
||||
vmovdqa $D0,0x60-0x90(%rax)
|
||||
vpermd $D3,$T0,$D3
|
||||
vmovdqa $D1,0x80-0x90(%rax)
|
||||
vpermd $D4,$T0,$D4
|
||||
vmovdqa $D2,0xa0-0x90(%rax)
|
||||
vpermd $MASK,$T0,$MASK
|
||||
vmovdqa $D3,0xc0-0x90(%rax)
|
||||
vmovdqa $D4,0xe0-0x90(%rax)
|
||||
vmovdqa $MASK,0x100-0x90(%rax)
|
||||
vmovdqa 64(%rcx),$MASK # .Lmask26
|
||||
|
||||
################################################################
|
||||
@ -1698,7 +1691,6 @@ $code.=<<___;
|
||||
vpand $MASK,$T3,$T3 # 3
|
||||
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
|
||||
|
||||
lea 0x90(%rsp),%rax # size optimization
|
||||
vpaddq $H2,$T2,$H2 # accumulate input
|
||||
sub \$64,$len
|
||||
jz .Ltail_avx2
|
||||
@ -2055,8 +2047,9 @@ $code.=<<___ if ($win64);
|
||||
.Ldo_avx512_body:
|
||||
___
|
||||
$code.=<<___;
|
||||
lea 48+64($ctx),$ctx # size optimization
|
||||
lea .Lconst(%rip),%rcx
|
||||
lea 48+64($ctx),$ctx # size optimization
|
||||
vmovdqa 96(%rcx),$T2 # .Lpermd_avx2
|
||||
|
||||
# expand pre-calculated table
|
||||
vmovdqu32 `16*0-64`($ctx),%x#$R0
|
||||
@ -2069,33 +2062,23 @@ $code.=<<___;
|
||||
vmovdqu32 `16*6-64`($ctx),%x#$S3
|
||||
vmovdqu32 `16*7-64`($ctx),%x#$R4
|
||||
vmovdqu32 `16*8-64`($ctx),%x#$S4
|
||||
vpermq \$0x15,$R0,$R0 # 00003412 -> 12343434
|
||||
vpermd $R0,$T2,$R0 # 00003412 -> 14243444
|
||||
vmovdqa64 64(%rcx),$MASK # .Lmask26
|
||||
vpermq \$0x15,$R1,$R1
|
||||
vmovdqa32 128(%rcx),$GATHER # .Lgather
|
||||
vpermq \$0x15,$S1,$S1
|
||||
vpshufd \$0xc8,$R0,$R0 # 12343434 -> 14243444
|
||||
vpermq \$0x15,$R2,$R2
|
||||
vpshufd \$0xc8,$R1,$R1
|
||||
vpermd $R1,$T2,$R1
|
||||
vpermd $S1,$T2,$S1
|
||||
vpermd $R2,$T2,$R2
|
||||
vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0
|
||||
vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
|
||||
vpermq \$0x15,$S2,$S2
|
||||
vpshufd \$0xc8,$S1,$S1
|
||||
vpermd $S2,$T2,$S2
|
||||
vmovdqa32 $R1,0x20(%rsp)
|
||||
vpsrlq \$32,$R1,$T1
|
||||
vpermq \$0x15,$R3,$R3
|
||||
vpshufd \$0xc8,$R2,$R2
|
||||
vpermd $R3,$T2,$R3
|
||||
vmovdqa32 $S1,0x40(%rsp)
|
||||
vpermq \$0x15,$S3,$S3
|
||||
vpshufd \$0xc8,$S2,$S2
|
||||
vpermq \$0x15,$R4,$R4
|
||||
vpshufd \$0xc8,$R3,$R3
|
||||
vpermd $S3,$T2,$S3
|
||||
vpermd $R4,$T2,$R4
|
||||
vmovdqa32 $R2,0x60(%rsp)
|
||||
vpermq \$0x15,$S4,$S4
|
||||
vpshufd \$0xc8,$S3,$S3
|
||||
vpermd $S4,$T2,$S4
|
||||
vmovdqa32 $S2,0x80(%rsp)
|
||||
vpshufd \$0xc8,$R4,$R4
|
||||
vpshufd \$0xc8,$S4,$S4
|
||||
vmovdqa32 $R3,0xa0(%rsp)
|
||||
vmovdqa32 $S3,0xc0(%rsp)
|
||||
vmovdqa32 $R4,0xe0(%rsp)
|
||||
@ -2275,14 +2258,16 @@ $code.=<<___;
|
||||
vpandq $MASK,$T2,$T2 # 2
|
||||
vpandq $MASK,$T0,$T0 # 0
|
||||
vpandq $MASK,$T1,$T1 # 1
|
||||
#vpandq $MASK,$T3,$T3 # 3
|
||||
vpandq $MASK,$T3,$T3 # 3
|
||||
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
|
||||
|
||||
vpaddq $H2,$T2,$H2 # accumulate input
|
||||
mov \$0x0f,%eax
|
||||
sub \$192,$len
|
||||
jbe .Ltail_avx512
|
||||
jmp .Loop_avx512
|
||||
|
||||
.align 32
|
||||
.Loop_avx512:
|
||||
################################################################
|
||||
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
|
||||
@ -2316,7 +2301,6 @@ $code.=<<___;
|
||||
vpaddq $H0,$T0,$H0
|
||||
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
|
||||
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
|
||||
vpandq $MASK,$T3,$T3 # 3, module-scheduled
|
||||
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
|
||||
vporq $PADBIT,$T4,$T4 # padbit, yes, always
|
||||
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
|
||||
@ -2425,7 +2409,7 @@ $code.=<<___;
|
||||
|
||||
vpandq $MASK,$T0,$T0 # 0
|
||||
vpandq $MASK,$T1,$T1 # 1
|
||||
#vpandq $MASK,$T3,$T3 # 3
|
||||
vpandq $MASK,$T3,$T3 # 3
|
||||
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
|
||||
|
||||
sub \$128,$len
|
||||
@ -2459,7 +2443,6 @@ $code.=<<___;
|
||||
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
|
||||
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
|
||||
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
|
||||
vpandq $MASK,$T3,$T3 # 3, module-scheduled
|
||||
vporq $PADBIT,$T4,$T4 # padbit, yes, always
|
||||
vpaddq $H1,$T1,$H1 # accumulate input
|
||||
vpaddq $H3,$T3,$H3
|
||||
@ -2647,10 +2630,8 @@ $code.=<<___;
|
||||
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
|
||||
.Lmask26:
|
||||
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
|
||||
.Lfive:
|
||||
.long 5,0,5,0,5,0,5,0
|
||||
.Lgather:
|
||||
.long 0,8, 32,40, 64,72, 96,104
|
||||
.Lpermd_avx2:
|
||||
.long 2,2,2,3,2,0,2,1
|
||||
___
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user