mirror of
https://github.com/openssl/openssl.git
synced 2024-11-23 18:13:39 +08:00
Remove trailing whitespace from some files.
The prevailing style seems to not have trailing whitespace, but a few lines do. This is mostly in the perlasm files, but a few C files got them after the reformat. This is the result of:

  find . -name '*.pl' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.c' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.h' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'

Then bn_prime.h was excluded since this is a generated file. Note mkerr.pl has some changes in a heredoc for some help output, but other lines there lack trailing whitespace too.

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
This commit is contained in:
parent
11542af65a
commit
609b0852e4
@ -146,7 +146,7 @@ OPTIONS cms_options[] = {
|
||||
"Do not load certificates from the default certificates directory"},
|
||||
{"content", OPT_CONTENT, '<',
|
||||
"Supply or override content for detached signature"},
|
||||
{"print", OPT_PRINT, '-',
|
||||
{"print", OPT_PRINT, '-',
|
||||
"For the -cmsout operation print out all fields of the CMS structure"},
|
||||
{"secretkey", OPT_SECRETKEY, 's'},
|
||||
{"secretkeyid", OPT_SECRETKEYID, 's'},
|
||||
|
@ -89,7 +89,7 @@ OPTIONS smime_options[] = {
|
||||
{"no-CApath", OPT_NOCAPATH, '-',
|
||||
"Do not load certificates from the default certificates directory"},
|
||||
{"resign", OPT_RESIGN, '-', "Resign a signed message"},
|
||||
{"nochain", OPT_NOCHAIN, '-',
|
||||
{"nochain", OPT_NOCHAIN, '-',
|
||||
"set PKCS7_NOCHAIN so certificates contained in the message are not used as untrusted CAs" },
|
||||
{"nosmimecap", OPT_NOSMIMECAP, '-', "Omit the SMIMECapabilities attribute"},
|
||||
{"stream", OPT_STREAM, '-', "Enable CMS streaming" },
|
||||
|
@ -1187,8 +1187,8 @@ static int run_benchmark(int async_jobs,
|
||||
continue;
|
||||
#endif
|
||||
|
||||
ret = ASYNC_start_job(&loopargs[i].inprogress_job,
|
||||
loopargs[i].wait_ctx, &job_op_count, loop_function,
|
||||
ret = ASYNC_start_job(&loopargs[i].inprogress_job,
|
||||
loopargs[i].wait_ctx, &job_op_count, loop_function,
|
||||
(void *)(loopargs + i), sizeof(loopargs_t));
|
||||
switch (ret) {
|
||||
case ASYNC_PAUSE:
|
||||
|
@ -123,7 +123,7 @@
|
||||
# words every cache-line is *guaranteed* to be accessed within ~50
|
||||
# cycles window. Why just SSE? Because it's needed on hyper-threading
|
||||
# CPU! Which is also why it's prefetched with 64 byte stride. Best
|
||||
# part is that it has no negative effect on performance:-)
|
||||
# part is that it has no negative effect on performance:-)
|
||||
#
|
||||
# Version 4.3 implements switch between compact and non-compact block
|
||||
# functions in AES_cbc_encrypt depending on how much data was asked
|
||||
@ -585,7 +585,7 @@ sub enctransform()
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# | mm4 | mm0 |
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# | s3 | s2 | s1 | s0 |
|
||||
# | s3 | s2 | s1 | s0 |
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
@ -805,7 +805,7 @@ sub encstep()
|
||||
|
||||
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
|
||||
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
|
||||
else { &mov ($tmp,$s[3]);
|
||||
else { &mov ($tmp,$s[3]);
|
||||
&shr ($tmp,24) }
|
||||
&xor ($out,&DWP(1,$te,$tmp,8));
|
||||
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
|
||||
@ -1558,7 +1558,7 @@ sub sse_deccompact()
|
||||
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
|
||||
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
|
||||
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
|
||||
|
||||
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
|
||||
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
|
||||
@ -2028,7 +2028,7 @@ sub declast()
|
||||
{
|
||||
# stack frame layout
|
||||
# -4(%esp) # return address 0(%esp)
|
||||
# 0(%esp) # s0 backing store 4(%esp)
|
||||
# 0(%esp) # s0 backing store 4(%esp)
|
||||
# 4(%esp) # s1 backing store 8(%esp)
|
||||
# 8(%esp) # s2 backing store 12(%esp)
|
||||
# 12(%esp) # s3 backing store 16(%esp)
|
||||
@ -2738,7 +2738,7 @@ sub enckey()
|
||||
&mov (&DWP(80,"edi"),10); # setup number of rounds
|
||||
&xor ("eax","eax");
|
||||
&jmp (&label("exit"));
|
||||
|
||||
|
||||
&set_label("12rounds");
|
||||
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
|
||||
&mov ("ebx",&DWP(4,"esi"));
|
||||
|
@ -1433,10 +1433,10 @@ $code.=<<___;
|
||||
xor $s1,$s1,$acc05
|
||||
xor $s2,$s2,$acc06
|
||||
xor $s3,$s3,$acc07
|
||||
xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
|
||||
xor $s1,$s1,$acc09
|
||||
xor $s2,$s2,$acc10
|
||||
xor $s3,$s3,$acc11
|
||||
xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
|
||||
xor $s1,$s1,$acc09
|
||||
xor $s2,$s2,$acc10
|
||||
xor $s3,$s3,$acc11
|
||||
|
||||
b Ldec_compact_loop
|
||||
.align 4
|
||||
|
@ -404,7 +404,7 @@ _s390x_AES_encrypt:
|
||||
or $s1,$t1
|
||||
or $t2,$i2
|
||||
or $t3,$i3
|
||||
|
||||
|
||||
srlg $i1,$s2,`8-3` # i0
|
||||
srlg $i2,$s2,`16-3` # i1
|
||||
nr $i1,$mask
|
||||
@ -457,7 +457,7 @@ _s390x_AES_encrypt:
|
||||
x $s2,24($key)
|
||||
x $s3,28($key)
|
||||
|
||||
br $ra
|
||||
br $ra
|
||||
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
|
||||
___
|
||||
|
||||
@ -779,7 +779,7 @@ _s390x_AES_decrypt:
|
||||
x $s2,24($key)
|
||||
x $s3,28($key)
|
||||
|
||||
br $ra
|
||||
br $ra
|
||||
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
|
||||
___
|
||||
|
||||
@ -1297,7 +1297,7 @@ $code.=<<___;
|
||||
.Lcbc_enc_done:
|
||||
l${g} $ivp,6*$SIZE_T($sp)
|
||||
st $s0,0($ivp)
|
||||
st $s1,4($ivp)
|
||||
st $s1,4($ivp)
|
||||
st $s2,8($ivp)
|
||||
st $s3,12($ivp)
|
||||
|
||||
@ -1635,7 +1635,7 @@ $code.=<<___ if(1);
|
||||
llgc $len,2*$SIZE_T-1($sp)
|
||||
nill $len,0x0f # $len%=16
|
||||
br $ra
|
||||
|
||||
|
||||
.align 16
|
||||
.Lxts_km_vanilla:
|
||||
___
|
||||
@ -1862,7 +1862,7 @@ $code.=<<___;
|
||||
xgr $s1,%r1
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
@ -1913,7 +1913,7 @@ $code.=<<___;
|
||||
xgr $s1,%r1
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
@ -2105,7 +2105,7 @@ $code.=<<___;
|
||||
xgr $s1,%r1
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
|
@ -1298,7 +1298,7 @@ $code.=<<___;
|
||||
AES_set_encrypt_key:
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12 # redundant, but allows to share
|
||||
push %r12 # redundant, but allows to share
|
||||
push %r13 # exception handler...
|
||||
push %r14
|
||||
push %r15
|
||||
@ -1424,7 +1424,7 @@ $code.=<<___;
|
||||
xor %rax,%rax
|
||||
jmp .Lexit
|
||||
|
||||
.L14rounds:
|
||||
.L14rounds:
|
||||
mov 0(%rsi),%rax # copy first 8 dwords
|
||||
mov 8(%rsi),%rbx
|
||||
mov 16(%rsi),%rcx
|
||||
|
@ -134,7 +134,7 @@ $code.=<<___ if ($win64);
|
||||
movaps %xmm10,0x40(%rsp)
|
||||
movaps %xmm11,0x50(%rsp)
|
||||
movaps %xmm12,0x60(%rsp)
|
||||
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
|
||||
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
|
||||
movaps %xmm14,-0x58(%rax)
|
||||
movaps %xmm15,-0x48(%rax)
|
||||
___
|
||||
@ -308,9 +308,9 @@ $code.=<<___;
|
||||
|
||||
movups @out[0],-16(@outptr[0],$offset)
|
||||
pxor @inp[0],@out[0]
|
||||
movups @out[1],-16(@outptr[1],$offset)
|
||||
movups @out[1],-16(@outptr[1],$offset)
|
||||
pxor @inp[1],@out[1]
|
||||
movups @out[2],-16(@outptr[2],$offset)
|
||||
movups @out[2],-16(@outptr[2],$offset)
|
||||
pxor @inp[2],@out[2]
|
||||
movups @out[3],-16(@outptr[3],$offset)
|
||||
pxor @inp[3],@out[3]
|
||||
@ -393,7 +393,7 @@ $code.=<<___ if ($win64);
|
||||
movaps %xmm10,0x40(%rsp)
|
||||
movaps %xmm11,0x50(%rsp)
|
||||
movaps %xmm12,0x60(%rsp)
|
||||
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
|
||||
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
|
||||
movaps %xmm14,-0x58(%rax)
|
||||
movaps %xmm15,-0x48(%rax)
|
||||
___
|
||||
@ -563,10 +563,10 @@ $code.=<<___;
|
||||
|
||||
movups @out[0],-16(@outptr[0],$offset)
|
||||
movdqu (@inptr[0],$offset),@out[0]
|
||||
movups @out[1],-16(@outptr[1],$offset)
|
||||
movups @out[1],-16(@outptr[1],$offset)
|
||||
movdqu (@inptr[1],$offset),@out[1]
|
||||
pxor $zero,@out[0]
|
||||
movups @out[2],-16(@outptr[2],$offset)
|
||||
movups @out[2],-16(@outptr[2],$offset)
|
||||
movdqu (@inptr[2],$offset),@out[2]
|
||||
pxor $zero,@out[1]
|
||||
movups @out[3],-16(@outptr[3],$offset)
|
||||
@ -835,10 +835,10 @@ $code.=<<___;
|
||||
vmovups @out[0],-16(@ptr[0]) # write output
|
||||
sub $offset,@ptr[0] # switch to input
|
||||
vpxor 0x00($offload),@out[0],@out[0]
|
||||
vmovups @out[1],-16(@ptr[1])
|
||||
vmovups @out[1],-16(@ptr[1])
|
||||
sub `64+1*8`(%rsp),@ptr[1]
|
||||
vpxor 0x10($offload),@out[1],@out[1]
|
||||
vmovups @out[2],-16(@ptr[2])
|
||||
vmovups @out[2],-16(@ptr[2])
|
||||
sub `64+2*8`(%rsp),@ptr[2]
|
||||
vpxor 0x20($offload),@out[2],@out[2]
|
||||
vmovups @out[3],-16(@ptr[3])
|
||||
@ -847,10 +847,10 @@ $code.=<<___;
|
||||
vmovups @out[4],-16(@ptr[4])
|
||||
sub `64+4*8`(%rsp),@ptr[4]
|
||||
vpxor @inp[0],@out[4],@out[4]
|
||||
vmovups @out[5],-16(@ptr[5])
|
||||
vmovups @out[5],-16(@ptr[5])
|
||||
sub `64+5*8`(%rsp),@ptr[5]
|
||||
vpxor @inp[1],@out[5],@out[5]
|
||||
vmovups @out[6],-16(@ptr[6])
|
||||
vmovups @out[6],-16(@ptr[6])
|
||||
sub `64+6*8`(%rsp),@ptr[6]
|
||||
vpxor @inp[2],@out[6],@out[6]
|
||||
vmovups @out[7],-16(@ptr[7])
|
||||
@ -1128,12 +1128,12 @@ $code.=<<___;
|
||||
sub $offset,@ptr[0] # switch to input
|
||||
vmovdqu 128+0(%rsp),@out[0]
|
||||
vpxor 0x70($offload),@out[7],@out[7]
|
||||
vmovups @out[1],-16(@ptr[1])
|
||||
vmovups @out[1],-16(@ptr[1])
|
||||
sub `64+1*8`(%rsp),@ptr[1]
|
||||
vmovdqu @out[0],0x00($offload)
|
||||
vpxor $zero,@out[0],@out[0]
|
||||
vmovdqu 128+16(%rsp),@out[1]
|
||||
vmovups @out[2],-16(@ptr[2])
|
||||
vmovups @out[2],-16(@ptr[2])
|
||||
sub `64+2*8`(%rsp),@ptr[2]
|
||||
vmovdqu @out[1],0x10($offload)
|
||||
vpxor $zero,@out[1],@out[1]
|
||||
@ -1149,11 +1149,11 @@ $code.=<<___;
|
||||
vpxor $zero,@out[3],@out[3]
|
||||
vmovdqu @inp[0],0x40($offload)
|
||||
vpxor @inp[0],$zero,@out[4]
|
||||
vmovups @out[5],-16(@ptr[5])
|
||||
vmovups @out[5],-16(@ptr[5])
|
||||
sub `64+5*8`(%rsp),@ptr[5]
|
||||
vmovdqu @inp[1],0x50($offload)
|
||||
vpxor @inp[1],$zero,@out[5]
|
||||
vmovups @out[6],-16(@ptr[6])
|
||||
vmovups @out[6],-16(@ptr[6])
|
||||
sub `64+6*8`(%rsp),@ptr[6]
|
||||
vmovdqu @inp[2],0x60($offload)
|
||||
vpxor @inp[2],$zero,@out[6]
|
||||
|
@ -793,7 +793,7 @@ sub body_00_19_dec () { # ((c^d)&b)^d
|
||||
sub body_20_39_dec () { # b^d^c
|
||||
# on entry @T[0]=b^d
|
||||
return &body_40_59_dec() if ($rx==39);
|
||||
|
||||
|
||||
my @r=@body_20_39;
|
||||
|
||||
unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
|
||||
|
@ -884,7 +884,7 @@ if ($avx>1) {{
|
||||
######################################################################
|
||||
# AVX2+BMI code path
|
||||
#
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $PUSH8=8*2*$SZ;
|
||||
use integer;
|
||||
|
||||
|
@ -1051,7 +1051,7 @@ if ($PREFIX eq "aesni") {
|
||||
&set_label("ctr32_one_shortcut",16);
|
||||
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
|
||||
|
||||
&set_label("ctr32_one");
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
|
@ -34,7 +34,7 @@
|
||||
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
|
||||
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
|
||||
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
|
||||
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
|
||||
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
|
||||
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
|
||||
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
|
||||
#
|
||||
@ -118,7 +118,7 @@
|
||||
# performance is achieved by interleaving instructions working on
|
||||
# independent blocks. In which case asymptotic limit for such modes
|
||||
# can be obtained by dividing above mentioned numbers by AES
|
||||
# instructions' interleave factor. Westmere can execute at most 3
|
||||
# instructions' interleave factor. Westmere can execute at most 3
|
||||
# instructions at a time, meaning that optimal interleave factor is 3,
|
||||
# and that's where the "magic" number of 1.25 comes from. "Optimal
|
||||
# interleave factor" means that increase of interleave factor does
|
||||
@ -312,7 +312,7 @@ ___
|
||||
# on 2x subroutine on Atom Silvermont account. For processors that
|
||||
# can schedule aes[enc|dec] every cycle optimal interleave factor
|
||||
# equals to corresponding instructions latency. 8x is optimal for
|
||||
# * Bridge and "super-optimal" for other Intel CPUs...
|
||||
# * Bridge and "super-optimal" for other Intel CPUs...
|
||||
|
||||
sub aesni_generate2 {
|
||||
my $dir=shift;
|
||||
@ -1271,7 +1271,7 @@ $code.=<<___;
|
||||
lea 7($ctr),%r9
|
||||
mov %r10d,0x60+12(%rsp)
|
||||
bswap %r9d
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||||
xor $key0,%r9d
|
||||
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
|
||||
mov %r9d,0x70+12(%rsp)
|
||||
@ -1551,7 +1551,7 @@ $code.=<<___;
|
||||
|
||||
.Lctr32_tail:
|
||||
# note that at this point $inout0..5 are populated with
|
||||
# counter values xor-ed with 0-round key
|
||||
# counter values xor-ed with 0-round key
|
||||
lea 16($key),$key
|
||||
cmp \$4,$len
|
||||
jb .Lctr32_loop3
|
||||
|
@ -3773,7 +3773,7 @@ foreach(split("\n",$code)) {
|
||||
if ($flavour =~ /le$/o) {
|
||||
SWITCH: for($conv) {
|
||||
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
|
||||
/\?rev/ && do { @bytes=reverse(@bytes); last; };
|
||||
/\?rev/ && do { @bytes=reverse(@bytes); last; };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -961,21 +961,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
|
||||
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
}
|
||||
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
|
||||
sub unvmov32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
|
@ -91,7 +91,7 @@ my @s=@_[12..15];
|
||||
|
||||
sub InBasisChange {
|
||||
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
my @b=@_[0..7];
|
||||
$code.=<<___;
|
||||
veor @b[2], @b[2], @b[1]
|
||||
|
@ -129,7 +129,7 @@ my @s=@_[12..15];
|
||||
|
||||
sub InBasisChange {
|
||||
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
my @b=@_[0..7];
|
||||
$code.=<<___;
|
||||
pxor @b[6], @b[5]
|
||||
@ -379,7 +379,7 @@ $code.=<<___;
|
||||
pxor @s[0], @t[3]
|
||||
pxor @s[1], @t[2]
|
||||
pxor @s[2], @t[1]
|
||||
pxor @s[3], @t[0]
|
||||
pxor @s[3], @t[0]
|
||||
|
||||
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
|
||||
|
||||
|
@ -769,7 +769,7 @@ _vpaes_schedule_core:
|
||||
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
||||
bl _vpaes_schedule_transform // input transform
|
||||
mov $inp, #7 // mov \$7, %esi
|
||||
|
||||
|
||||
.Loop_schedule_256:
|
||||
sub $inp, $inp, #1 // dec %esi
|
||||
bl _vpaes_schedule_mangle // output low result
|
||||
@ -778,7 +778,7 @@ _vpaes_schedule_core:
|
||||
// high round
|
||||
bl _vpaes_schedule_round
|
||||
cbz $inp, .Lschedule_mangle_last
|
||||
bl _vpaes_schedule_mangle
|
||||
bl _vpaes_schedule_mangle
|
||||
|
||||
// low round. swap xmm7 and xmm6
|
||||
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
|
||||
@ -787,7 +787,7 @@ _vpaes_schedule_core:
|
||||
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
|
||||
bl _vpaes_schedule_low_round
|
||||
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
|
||||
|
||||
|
||||
b .Loop_schedule_256
|
||||
|
||||
##
|
||||
@ -814,7 +814,7 @@ _vpaes_schedule_core:
|
||||
|
||||
.Lschedule_mangle_last_dec:
|
||||
ld1 {v20.2d-v21.2d}, [x11] // reload constants
|
||||
sub $out, $out, #16 // add \$-16, %rdx
|
||||
sub $out, $out, #16 // add \$-16, %rdx
|
||||
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
|
||||
bl _vpaes_schedule_transform // output transform
|
||||
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
|
||||
|
@ -1074,7 +1074,7 @@ Loop_schedule_256:
|
||||
# high round
|
||||
bl _vpaes_schedule_round
|
||||
bdz Lschedule_mangle_last # dec %esi
|
||||
bl _vpaes_schedule_mangle
|
||||
bl _vpaes_schedule_mangle
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
|
||||
@ -1082,7 +1082,7 @@ Loop_schedule_256:
|
||||
vmr v7, v6 # vmovdqa %xmm6, %xmm7
|
||||
bl _vpaes_schedule_low_round
|
||||
vmr v7, v5 # vmovdqa %xmm5, %xmm7
|
||||
|
||||
|
||||
b Loop_schedule_256
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
@ -1130,7 +1130,7 @@ Lschedule_mangle_last:
|
||||
Lschedule_mangle_last_dec:
|
||||
lvx $iptlo, r11, r12 # reload $ipt
|
||||
lvx $ipthi, r9, r12
|
||||
addi $out, $out, -16 # add \$-16, %rdx
|
||||
addi $out, $out, -16 # add \$-16, %rdx
|
||||
vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
|
||||
bl _vpaes_schedule_transform # output transform
|
||||
|
||||
@ -1565,7 +1565,7 @@ foreach (split("\n",$code)) {
|
||||
if ($flavour =~ /le$/o) {
|
||||
SWITCH: for($conv) {
|
||||
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
|
||||
/\?rev/ && do { @bytes=reverse(@bytes); last; };
|
||||
/\?rev/ && do { @bytes=reverse(@bytes); last; };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -445,7 +445,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
##
|
||||
&set_label("schedule_192",16);
|
||||
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&movdqa ("xmm6","xmm0"); # save short part
|
||||
&pxor ("xmm4","xmm4"); # clear 4
|
||||
&movhlps("xmm6","xmm4"); # clobber low side with zeros
|
||||
@ -476,7 +476,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
##
|
||||
&set_label("schedule_256",16);
|
||||
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&mov ($round,7);
|
||||
|
||||
&set_label("loop_schedule_256");
|
||||
@ -487,7 +487,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
@ -610,7 +610,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
# subbyte
|
||||
&movdqa ("xmm4",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm4"); # 0 = k
|
||||
|
@ -171,7 +171,7 @@ _vpaes_encrypt_core:
|
||||
pshufb %xmm1, %xmm0
|
||||
ret
|
||||
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
|
||||
|
||||
|
||||
##
|
||||
## Decryption core
|
||||
##
|
||||
@ -332,7 +332,7 @@ _vpaes_schedule_core:
|
||||
##
|
||||
.Lschedule_128:
|
||||
mov \$10, %esi
|
||||
|
||||
|
||||
.Loop_schedule_128:
|
||||
call _vpaes_schedule_round
|
||||
dec %rsi
|
||||
@ -366,7 +366,7 @@ _vpaes_schedule_core:
|
||||
|
||||
.Loop_schedule_192:
|
||||
call _vpaes_schedule_round
|
||||
palignr \$8,%xmm6,%xmm0
|
||||
palignr \$8,%xmm6,%xmm0
|
||||
call _vpaes_schedule_mangle # save key n
|
||||
call _vpaes_schedule_192_smear
|
||||
call _vpaes_schedule_mangle # save key n+1
|
||||
@ -392,7 +392,7 @@ _vpaes_schedule_core:
|
||||
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
||||
call _vpaes_schedule_transform # input transform
|
||||
mov \$7, %esi
|
||||
|
||||
|
||||
.Loop_schedule_256:
|
||||
call _vpaes_schedule_mangle # output low result
|
||||
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
|
||||
@ -401,7 +401,7 @@ _vpaes_schedule_core:
|
||||
call _vpaes_schedule_round
|
||||
dec %rsi
|
||||
jz .Lschedule_mangle_last
|
||||
call _vpaes_schedule_mangle
|
||||
call _vpaes_schedule_mangle
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
pshufd \$0xFF, %xmm0, %xmm0
|
||||
@ -409,10 +409,10 @@ _vpaes_schedule_core:
|
||||
movdqa %xmm6, %xmm7
|
||||
call _vpaes_schedule_low_round
|
||||
movdqa %xmm5, %xmm7
|
||||
|
||||
|
||||
jmp .Loop_schedule_256
|
||||
|
||||
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
##
|
||||
@ -511,9 +511,9 @@ _vpaes_schedule_round:
|
||||
# rotate
|
||||
pshufd \$0xFF, %xmm0, %xmm0
|
||||
palignr \$1, %xmm0, %xmm0
|
||||
|
||||
|
||||
# fall through...
|
||||
|
||||
|
||||
# low round: same as high round, but no rotation and no rcon.
|
||||
_vpaes_schedule_low_round:
|
||||
# smear xmm7
|
||||
@ -552,7 +552,7 @@ _vpaes_schedule_low_round:
|
||||
pxor %xmm4, %xmm0 # 0 = sbox output
|
||||
|
||||
# add in smeared stuff
|
||||
pxor %xmm7, %xmm0
|
||||
pxor %xmm7, %xmm0
|
||||
movdqa %xmm0, %xmm7
|
||||
ret
|
||||
.size _vpaes_schedule_round,.-_vpaes_schedule_round
|
||||
|
@ -36,7 +36,7 @@
|
||||
#
|
||||
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
$flavour = shift;
|
||||
|
@ -23,7 +23,7 @@
|
||||
# [depending on key length, less for longer keys] on ARM920T, and
|
||||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||||
# base and compiler generated code with in-lined umull and even umlal
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# "advantage" of utilizing some "secret" instruction.
|
||||
#
|
||||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||||
|
@ -54,7 +54,7 @@ sub bn_mul_add_words
|
||||
&movd("mm0",&wparam(3)); # mm0 = w
|
||||
&pxor("mm1","mm1"); # mm1 = carry_in
|
||||
&jmp(&label("maw_sse2_entry"));
|
||||
|
||||
|
||||
&set_label("maw_sse2_unrolled",16);
|
||||
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
|
||||
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
|
||||
@ -675,20 +675,20 @@ sub bn_sub_part_words
|
||||
&adc($c,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($b,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_neg_loop"));
|
||||
|
||||
|
||||
&set_label("pw_neg_finish",0);
|
||||
&mov($tmp2,&wparam(4)); # get dl
|
||||
&mov($num,0);
|
||||
&sub($num,$tmp2);
|
||||
&and($num,7);
|
||||
&jz(&label("pw_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("dl<0 Tail Round $i");
|
||||
@ -705,9 +705,9 @@ sub bn_sub_part_words
|
||||
}
|
||||
|
||||
&jmp(&label("pw_end"));
|
||||
|
||||
|
||||
&set_label("pw_pos",0);
|
||||
|
||||
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
&jz(&label("pw_pos_finish"));
|
||||
|
||||
@ -722,18 +722,18 @@ sub bn_sub_part_words
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
&jnc(&label("pw_nc".$i));
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_pos_loop"));
|
||||
|
||||
|
||||
&set_label("pw_pos_finish",0);
|
||||
&mov($num,&wparam(4)); # get dl
|
||||
&and($num,7);
|
||||
&jz(&label("pw_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("dl>0 Tail Round $i");
|
||||
@ -754,17 +754,17 @@ sub bn_sub_part_words
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
&set_label("pw_nc".$i,0);
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_nc_loop"));
|
||||
|
||||
|
||||
&mov($num,&wparam(4)); # get dl
|
||||
&and($num,7);
|
||||
&jz(&label("pw_nc_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
|
@ -47,7 +47,7 @@ sub mul_add_c
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
|
||||
}
|
||||
@ -76,7 +76,7 @@ sub sqr_add_c
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
}
|
||||
@ -127,7 +127,7 @@ sub bn_mul_comba
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
@ -142,9 +142,9 @@ sub bn_mul_comba
|
||||
&push("ebx");
|
||||
|
||||
&xor($c0,$c0);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&xor($c1,$c1);
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
@ -152,7 +152,7 @@ sub bn_mul_comba
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("################## Calculate word $i");
|
||||
&comment("################## Calculate word $i");
|
||||
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
|
@ -80,7 +80,7 @@ $code=<<___;
|
||||
|
||||
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
|
||||
// const BN_ULONG *bp,const BN_ULONG *np,
|
||||
// const BN_ULONG *n0p,int num);
|
||||
// const BN_ULONG *n0p,int num);
|
||||
.align 64
|
||||
.global bn_mul_mont#
|
||||
.proc bn_mul_mont#
|
||||
@ -203,7 +203,7 @@ bn_mul_mont_general:
|
||||
{ .mmi; .pred.rel "mutex",p39,p41
|
||||
(p39) add topbit=r0,r0
|
||||
(p41) add topbit=r0,r0,1
|
||||
nop.i 0 }
|
||||
nop.i 0 }
|
||||
{ .mmi; st8 [tp_1]=n[0]
|
||||
add tptr=16,sp
|
||||
add tp_1=8,sp };;
|
||||
|
@ -603,13 +603,13 @@ $code.=<<___;
|
||||
sltu $v0,$t2,$ta2
|
||||
$ST $t2,-2*$BNSZ($a0)
|
||||
$ADDU $v0,$t8
|
||||
|
||||
|
||||
$ADDU $ta3,$t3
|
||||
sltu $t9,$ta3,$t3
|
||||
$ADDU $t3,$ta3,$v0
|
||||
sltu $v0,$t3,$ta3
|
||||
$ST $t3,-$BNSZ($a0)
|
||||
|
||||
|
||||
.set noreorder
|
||||
bgtz $at,.L_bn_add_words_loop
|
||||
$ADDU $v0,$t9
|
||||
@ -808,7 +808,7 @@ bn_div_3_words:
|
||||
# so that we can save two arguments
|
||||
# and return address in registers
|
||||
# instead of stack:-)
|
||||
|
||||
|
||||
$LD $a0,($a3)
|
||||
move $ta2,$a1
|
||||
bne $a0,$a2,bn_div_3_words_internal
|
||||
|
@ -546,7 +546,7 @@ L\$copy
|
||||
ldd $idx($np),$hi0
|
||||
std,ma %r0,8($tp)
|
||||
addib,<> 8,$idx,.-8 ; L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
std,ma $hi0,8($rp)
|
||||
___
|
||||
|
||||
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
||||
@ -868,7 +868,7 @@ L\$copy_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
stws,ma %r0,4($tp)
|
||||
addib,<> 4,$idx,L\$copy_pa11
|
||||
stws,ma $hi0,4($rp)
|
||||
stws,ma $hi0,4($rp)
|
||||
|
||||
nop ; alignment
|
||||
L\$done
|
||||
|
@ -26,7 +26,7 @@
|
||||
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
|
||||
# for 64-bit application running on PPC970/G5 is:
|
||||
#
|
||||
# 512-bit +65%
|
||||
# 512-bit +65%
|
||||
# 1024-bit +35%
|
||||
# 2048-bit +18%
|
||||
# 4096-bit +4%
|
||||
@ -49,7 +49,7 @@ if ($flavour =~ /32/) {
|
||||
$UMULL= "mullw"; # unsigned multiply low
|
||||
$UMULH= "mulhwu"; # unsigned multiply high
|
||||
$UCMP= "cmplw"; # unsigned compare
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} elsif ($flavour =~ /64/) {
|
||||
@ -69,7 +69,7 @@ if ($flavour =~ /32/) {
|
||||
$UMULL= "mulld"; # unsigned multiply low
|
||||
$UMULH= "mulhdu"; # unsigned multiply high
|
||||
$UCMP= "cmpld"; # unsigned compare
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} else { die "nonsense $flavour"; }
|
||||
|
@ -38,7 +38,7 @@
|
||||
#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
|
||||
#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
|
||||
#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
|
||||
#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
|
||||
#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
|
||||
#
|
||||
# Same benchmark with this assembler code:
|
||||
#
|
||||
@ -74,7 +74,7 @@
|
||||
#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
|
||||
#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
|
||||
#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
|
||||
#
|
||||
#
|
||||
# Again, performance increases by about 75%
|
||||
#
|
||||
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
|
||||
@ -125,7 +125,7 @@ if ($flavour =~ /32/) {
|
||||
$CNTLZ= "cntlzw"; # count leading zeros
|
||||
$SHL= "slw"; # shift left
|
||||
$SHR= "srw"; # unsigned shift right
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$SHLI= "slwi"; # shift left by immediate
|
||||
$CLRU= "clrlwi"; # clear upper bits
|
||||
$INSR= "insrwi"; # insert right
|
||||
@ -149,10 +149,10 @@ if ($flavour =~ /32/) {
|
||||
$CNTLZ= "cntlzd"; # count leading zeros
|
||||
$SHL= "sld"; # shift left
|
||||
$SHR= "srd"; # unsigned shift right
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$SHLI= "sldi"; # shift left by immediate
|
||||
$CLRU= "clrldi"; # clear upper bits
|
||||
$INSR= "insrdi"; # insert right
|
||||
$INSR= "insrdi"; # insert right
|
||||
$ROTL= "rotldi"; # rotate left by immediate
|
||||
$TR= "td"; # conditional trap
|
||||
} else { die "nonsense $flavour"; }
|
||||
@ -189,7 +189,7 @@ $data=<<EOF;
|
||||
# below.
|
||||
# 12/05/03 Suresh Chari
|
||||
# (with lots of help from) Andy Polyakov
|
||||
##
|
||||
##
|
||||
# 1. Initial version 10/20/02 Suresh Chari
|
||||
#
|
||||
#
|
||||
@ -202,7 +202,7 @@ $data=<<EOF;
|
||||
# be done in the build process.
|
||||
#
|
||||
# Hand optimized assembly code for the following routines
|
||||
#
|
||||
#
|
||||
# bn_sqr_comba4
|
||||
# bn_sqr_comba8
|
||||
# bn_mul_comba4
|
||||
@ -225,10 +225,10 @@ $data=<<EOF;
|
||||
#--------------------------------------------------------------------------
|
||||
#
|
||||
# Defines to be used in the assembly code.
|
||||
#
|
||||
#
|
||||
#.set r0,0 # we use it as storage for value of 0
|
||||
#.set SP,1 # preserved
|
||||
#.set RTOC,2 # preserved
|
||||
#.set RTOC,2 # preserved
|
||||
#.set r3,3 # 1st argument/return value
|
||||
#.set r4,4 # 2nd argument/volatile register
|
||||
#.set r5,5 # 3rd argument/volatile register
|
||||
@ -246,7 +246,7 @@ $data=<<EOF;
|
||||
# the first . i.e. for example change ".bn_sqr_comba4"
|
||||
# to "bn_sqr_comba4". This should be automatically done
|
||||
# in the build.
|
||||
|
||||
|
||||
.globl .bn_sqr_comba4
|
||||
.globl .bn_sqr_comba8
|
||||
.globl .bn_mul_comba4
|
||||
@ -257,9 +257,9 @@ $data=<<EOF;
|
||||
.globl .bn_sqr_words
|
||||
.globl .bn_mul_words
|
||||
.globl .bn_mul_add_words
|
||||
|
||||
|
||||
# .text section
|
||||
|
||||
|
||||
.machine "any"
|
||||
|
||||
#
|
||||
@ -278,8 +278,8 @@ $data=<<EOF;
|
||||
# r3 contains r
|
||||
# r4 contains a
|
||||
#
|
||||
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
|
||||
#
|
||||
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
|
||||
#
|
||||
# r5,r6 are the two BN_ULONGs being multiplied.
|
||||
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
|
||||
# r9,r10, r11 are the equivalents of c1,c2, c3.
|
||||
@ -288,10 +288,10 @@ $data=<<EOF;
|
||||
#
|
||||
xor r0,r0,r0 # set r0 = 0. Used in the addze
|
||||
# instructions below
|
||||
|
||||
|
||||
#sqr_add_c(a,0,c1,c2,c3)
|
||||
$LD r5,`0*$BNSZ`(r4)
|
||||
$UMULL r9,r5,r5
|
||||
$LD r5,`0*$BNSZ`(r4)
|
||||
$UMULL r9,r5,r5
|
||||
$UMULH r10,r5,r5 #in first iteration. No need
|
||||
#to add since c1=c2=c3=0.
|
||||
# Note c3(r11) is NOT set to 0
|
||||
@ -299,20 +299,20 @@ $data=<<EOF;
|
||||
|
||||
$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
|
||||
# sqr_add_c2(a,1,0,c2,c3,c1);
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
|
||||
adde r8,r8,r8
|
||||
addze r9,r0 # catch carry if any.
|
||||
# r9= r0(=0) and carry
|
||||
|
||||
# r9= r0(=0) and carry
|
||||
|
||||
addc r10,r7,r10 # now add to temp result.
|
||||
addze r11,r8 # r8 added to r11 which is 0
|
||||
addze r11,r8 # r8 added to r11 which is 0
|
||||
addze r9,r9
|
||||
|
||||
$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
|
||||
|
||||
$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
|
||||
#sqr_add_c(a,1,c3,c1,c2)
|
||||
$UMULL r7,r6,r6
|
||||
$UMULH r8,r6,r6
|
||||
@ -323,23 +323,23 @@ $data=<<EOF;
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r7,r7,r7
|
||||
adde r8,r8,r8
|
||||
addze r10,r10
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
|
||||
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
|
||||
#sqr_add_c2(a,3,0,c1,c2,c3);
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
addc r7,r7,r7
|
||||
adde r8,r8,r8
|
||||
addze r11,r0
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
@ -348,7 +348,7 @@ $data=<<EOF;
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r7,r7,r7
|
||||
adde r8,r8,r8
|
||||
addze r11,r11
|
||||
@ -363,31 +363,31 @@ $data=<<EOF;
|
||||
adde r11,r8,r11
|
||||
addze r9,r0
|
||||
#sqr_add_c2(a,3,1,c2,c3,c1);
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
addc r7,r7,r7
|
||||
adde r8,r8,r8
|
||||
addze r9,r9
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
$ST r10,`4*$BNSZ`(r3) #r[4]=c2
|
||||
#sqr_add_c2(a,3,2,c3,c1,c2);
|
||||
$LD r5,`2*$BNSZ`(r4)
|
||||
$LD r5,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
addc r7,r7,r7
|
||||
adde r8,r8,r8
|
||||
addze r10,r0
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
$ST r11,`5*$BNSZ`(r3) #r[5] = c3
|
||||
#sqr_add_c(a,3,c1,c2,c3);
|
||||
$UMULL r7,r6,r6
|
||||
$UMULL r7,r6,r6
|
||||
$UMULH r8,r6,r6
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
@ -406,7 +406,7 @@ $data=<<EOF;
|
||||
# for the gcc compiler. This should be automatically
|
||||
# done in the build
|
||||
#
|
||||
|
||||
|
||||
.align 4
|
||||
.bn_sqr_comba8:
|
||||
#
|
||||
@ -418,15 +418,15 @@ $data=<<EOF;
|
||||
# r3 contains r
|
||||
# r4 contains a
|
||||
#
|
||||
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
|
||||
#
|
||||
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
|
||||
#
|
||||
# r5,r6 are the two BN_ULONGs being multiplied.
|
||||
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
|
||||
# r9,r10, r11 are the equivalents of c1,c2, c3.
|
||||
#
|
||||
# Possible optimization of loading all 8 longs of a into registers
|
||||
# doesn't provide any speedup
|
||||
#
|
||||
#
|
||||
|
||||
xor r0,r0,r0 #set r0 = 0.Used in addze
|
||||
#instructions below.
|
||||
@ -439,18 +439,18 @@ $data=<<EOF;
|
||||
#sqr_add_c2(a,1,0,c2,c3,c1);
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
addc r10,r7,r10 #add the two register number
|
||||
adde r11,r8,r0 # (r8,r7) to the three register
|
||||
addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
|
||||
|
||||
|
||||
addc r10,r7,r10 #add the two register number
|
||||
adde r11,r8,r11 # (r8,r7) to the three register
|
||||
addze r9,r9 # number (r9,r11,r10).
|
||||
|
||||
|
||||
$ST r10,`1*$BNSZ`(r3) # r[1]=c2
|
||||
|
||||
|
||||
#sqr_add_c(a,1,c3,c1,c2);
|
||||
$UMULL r7,r6,r6
|
||||
$UMULH r8,r6,r6
|
||||
@ -461,25 +461,25 @@ $data=<<EOF;
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
|
||||
|
||||
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
|
||||
#sqr_add_c2(a,3,0,c1,c2,c3);
|
||||
$LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r0
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
@ -488,20 +488,20 @@ $data=<<EOF;
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
|
||||
|
||||
$ST r9,`3*$BNSZ`(r3) #r[3]=c1;
|
||||
#sqr_add_c(a,2,c2,c3,c1);
|
||||
$UMULL r7,r6,r6
|
||||
$UMULH r8,r6,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r0
|
||||
@ -509,11 +509,11 @@ $data=<<EOF;
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
@ -522,11 +522,11 @@ $data=<<EOF;
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
@ -535,11 +535,11 @@ $data=<<EOF;
|
||||
$LD r6,`5*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r0
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
@ -548,11 +548,11 @@ $data=<<EOF;
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
@ -561,11 +561,11 @@ $data=<<EOF;
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
@ -580,11 +580,11 @@ $data=<<EOF;
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
@ -593,11 +593,11 @@ $data=<<EOF;
|
||||
$LD r6,`5*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
@ -617,7 +617,7 @@ $data=<<EOF;
|
||||
$LD r6,`7*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r0
|
||||
@ -629,7 +629,7 @@ $data=<<EOF;
|
||||
$LD r6,`6*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
@ -652,7 +652,7 @@ $data=<<EOF;
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r10,r7,r10
|
||||
adde r11,r8,r11
|
||||
addze r9,r9
|
||||
@ -684,7 +684,7 @@ $data=<<EOF;
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
|
||||
|
||||
addc r11,r7,r11
|
||||
adde r9,r8,r9
|
||||
addze r10,r10
|
||||
@ -704,7 +704,7 @@ $data=<<EOF;
|
||||
$LD r5,`2*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
$UMULH r8,r5,r6
|
||||
|
||||
|
||||
addc r9,r7,r9
|
||||
adde r10,r8,r10
|
||||
addze r11,r0
|
||||
@ -801,7 +801,7 @@ $data=<<EOF;
|
||||
adde r10,r8,r10
|
||||
addze r11,r11
|
||||
$ST r9,`12*$BNSZ`(r3) #r[12]=c1;
|
||||
|
||||
|
||||
#sqr_add_c2(a,7,6,c2,c3,c1)
|
||||
$LD r5,`6*$BNSZ`(r4)
|
||||
$UMULL r7,r5,r6
|
||||
@ -850,21 +850,21 @@ $data=<<EOF;
|
||||
#
|
||||
xor r0,r0,r0 #r0=0. Used in addze below.
|
||||
#mul_add_c(a[0],b[0],c1,c2,c3);
|
||||
$LD r6,`0*$BNSZ`(r4)
|
||||
$LD r7,`0*$BNSZ`(r5)
|
||||
$UMULL r10,r6,r7
|
||||
$UMULH r11,r6,r7
|
||||
$LD r6,`0*$BNSZ`(r4)
|
||||
$LD r7,`0*$BNSZ`(r5)
|
||||
$UMULL r10,r6,r7
|
||||
$UMULH r11,r6,r7
|
||||
$ST r10,`0*$BNSZ`(r3) #r[0]=c1
|
||||
#mul_add_c(a[0],b[1],c2,c3,c1);
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r11,r8,r11
|
||||
adde r12,r9,r0
|
||||
addze r10,r0
|
||||
#mul_add_c(a[1],b[0],c2,c3,c1);
|
||||
$LD r6, `1*$BNSZ`(r4)
|
||||
$LD r7, `0*$BNSZ`(r5)
|
||||
$LD r6, `1*$BNSZ`(r4)
|
||||
$LD r7, `0*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r11,r8,r11
|
||||
@ -872,23 +872,23 @@ $data=<<EOF;
|
||||
addze r10,r10
|
||||
$ST r11,`1*$BNSZ`(r3) #r[1]=c2
|
||||
#mul_add_c(a[2],b[0],c3,c1,c2);
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r12,r8,r12
|
||||
adde r10,r9,r10
|
||||
addze r11,r0
|
||||
#mul_add_c(a[1],b[1],c3,c1,c2);
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r12,r8,r12
|
||||
adde r10,r9,r10
|
||||
addze r11,r11
|
||||
#mul_add_c(a[0],b[2],c3,c1,c2);
|
||||
$LD r6,`0*$BNSZ`(r4)
|
||||
$LD r7,`2*$BNSZ`(r5)
|
||||
$LD r6,`0*$BNSZ`(r4)
|
||||
$LD r7,`2*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r12,r8,r12
|
||||
@ -896,7 +896,7 @@ $data=<<EOF;
|
||||
addze r11,r11
|
||||
$ST r12,`2*$BNSZ`(r3) #r[2]=c3
|
||||
#mul_add_c(a[0],b[3],c1,c2,c3);
|
||||
$LD r7,`3*$BNSZ`(r5)
|
||||
$LD r7,`3*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r10,r8,r10
|
||||
@ -928,7 +928,7 @@ $data=<<EOF;
|
||||
addze r12,r12
|
||||
$ST r10,`3*$BNSZ`(r3) #r[3]=c1
|
||||
#mul_add_c(a[3],b[1],c2,c3,c1);
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r11,r8,r11
|
||||
@ -952,7 +952,7 @@ $data=<<EOF;
|
||||
addze r10,r10
|
||||
$ST r11,`4*$BNSZ`(r3) #r[4]=c2
|
||||
#mul_add_c(a[2],b[3],c3,c1,c2);
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r12,r8,r12
|
||||
@ -968,7 +968,7 @@ $data=<<EOF;
|
||||
addze r11,r11
|
||||
$ST r12,`5*$BNSZ`(r3) #r[5]=c3
|
||||
#mul_add_c(a[3],b[3],c1,c2,c3);
|
||||
$LD r7,`3*$BNSZ`(r5)
|
||||
$LD r7,`3*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
addc r10,r8,r10
|
||||
@ -988,7 +988,7 @@ $data=<<EOF;
|
||||
# for the gcc compiler. This should be automatically
|
||||
# done in the build
|
||||
#
|
||||
|
||||
|
||||
.align 4
|
||||
.bn_mul_comba8:
|
||||
#
|
||||
@ -1003,7 +1003,7 @@ $data=<<EOF;
|
||||
# r10, r11, r12 are the equivalents of c1, c2, and c3.
|
||||
#
|
||||
xor r0,r0,r0 #r0=0. Used in addze below.
|
||||
|
||||
|
||||
#mul_add_c(a[0],b[0],c1,c2,c3);
|
||||
$LD r6,`0*$BNSZ`(r4) #a[0]
|
||||
$LD r7,`0*$BNSZ`(r5) #b[0]
|
||||
@ -1065,7 +1065,7 @@ $data=<<EOF;
|
||||
addc r10,r10,r8
|
||||
adde r11,r11,r9
|
||||
addze r12,r12
|
||||
|
||||
|
||||
#mul_add_c(a[2],b[1],c1,c2,c3);
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
@ -1131,7 +1131,7 @@ $data=<<EOF;
|
||||
adde r10,r10,r9
|
||||
addze r11,r0
|
||||
#mul_add_c(a[1],b[4],c3,c1,c2);
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$LD r6,`1*$BNSZ`(r4)
|
||||
$LD r7,`4*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
@ -1139,7 +1139,7 @@ $data=<<EOF;
|
||||
adde r10,r10,r9
|
||||
addze r11,r11
|
||||
#mul_add_c(a[2],b[3],c3,c1,c2);
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$LD r6,`2*$BNSZ`(r4)
|
||||
$LD r7,`3*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
@ -1147,7 +1147,7 @@ $data=<<EOF;
|
||||
adde r10,r10,r9
|
||||
addze r11,r11
|
||||
#mul_add_c(a[3],b[2],c3,c1,c2);
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$LD r6,`3*$BNSZ`(r4)
|
||||
$LD r7,`2*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
@ -1155,7 +1155,7 @@ $data=<<EOF;
|
||||
adde r10,r10,r9
|
||||
addze r11,r11
|
||||
#mul_add_c(a[4],b[1],c3,c1,c2);
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$LD r6,`4*$BNSZ`(r4)
|
||||
$LD r7,`1*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
@ -1163,7 +1163,7 @@ $data=<<EOF;
|
||||
adde r10,r10,r9
|
||||
addze r11,r11
|
||||
#mul_add_c(a[5],b[0],c3,c1,c2);
|
||||
$LD r6,`5*$BNSZ`(r4)
|
||||
$LD r6,`5*$BNSZ`(r4)
|
||||
$LD r7,`0*$BNSZ`(r5)
|
||||
$UMULL r8,r6,r7
|
||||
$UMULH r9,r6,r7
|
||||
@ -1555,7 +1555,7 @@ $data=<<EOF;
|
||||
addi r3,r3,-$BNSZ
|
||||
addi r5,r5,-$BNSZ
|
||||
mtctr r6
|
||||
Lppcasm_sub_mainloop:
|
||||
Lppcasm_sub_mainloop:
|
||||
$LDU r7,$BNSZ(r4)
|
||||
$LDU r8,$BNSZ(r5)
|
||||
subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
|
||||
@ -1563,7 +1563,7 @@ Lppcasm_sub_mainloop:
|
||||
# is r7-r8 -1 as we need.
|
||||
$STU r6,$BNSZ(r3)
|
||||
bdnz Lppcasm_sub_mainloop
|
||||
Lppcasm_sub_adios:
|
||||
Lppcasm_sub_adios:
|
||||
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
|
||||
andi. r3,r3,1 # keep only last bit.
|
||||
blr
|
||||
@ -1604,13 +1604,13 @@ Lppcasm_sub_adios:
|
||||
addi r3,r3,-$BNSZ
|
||||
addi r5,r5,-$BNSZ
|
||||
mtctr r6
|
||||
Lppcasm_add_mainloop:
|
||||
Lppcasm_add_mainloop:
|
||||
$LDU r7,$BNSZ(r4)
|
||||
$LDU r8,$BNSZ(r5)
|
||||
adde r8,r7,r8
|
||||
$STU r8,$BNSZ(r3)
|
||||
bdnz Lppcasm_add_mainloop
|
||||
Lppcasm_add_adios:
|
||||
Lppcasm_add_adios:
|
||||
addze r3,r0 #return carry bit.
|
||||
blr
|
||||
.long 0
|
||||
@ -1633,11 +1633,11 @@ Lppcasm_add_adios:
|
||||
# the PPC instruction to count leading zeros instead
|
||||
# of call to num_bits_word. Since this was compiled
|
||||
# only at level -O2 we can possibly squeeze it more?
|
||||
#
|
||||
#
|
||||
# r3 = h
|
||||
# r4 = l
|
||||
# r5 = d
|
||||
|
||||
|
||||
$UCMPI 0,r5,0 # compare r5 and 0
|
||||
bne Lppcasm_div1 # proceed if d!=0
|
||||
li r3,-1 # d=0 return -1
|
||||
@ -1653,7 +1653,7 @@ Lppcasm_div1:
|
||||
Lppcasm_div2:
|
||||
$UCMP 0,r3,r5 #h>=d?
|
||||
blt Lppcasm_div3 #goto Lppcasm_div3 if not
|
||||
subf r3,r5,r3 #h-=d ;
|
||||
subf r3,r5,r3 #h-=d ;
|
||||
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
|
||||
cmpi 0,0,r7,0 # is (i == 0)?
|
||||
beq Lppcasm_div4
|
||||
@ -1668,7 +1668,7 @@ Lppcasm_div4:
|
||||
# as it saves registers.
|
||||
li r6,2 #r6=2
|
||||
mtctr r6 #counter will be in count.
|
||||
Lppcasm_divouterloop:
|
||||
Lppcasm_divouterloop:
|
||||
$SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
|
||||
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
|
||||
# compute here for innerloop.
|
||||
@ -1676,7 +1676,7 @@ Lppcasm_divouterloop:
|
||||
bne Lppcasm_div5 # goto Lppcasm_div5 if not
|
||||
|
||||
li r8,-1
|
||||
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
|
||||
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
|
||||
b Lppcasm_div6
|
||||
Lppcasm_div5:
|
||||
$UDIV r8,r3,r9 #q = h/dh
|
||||
@ -1684,7 +1684,7 @@ Lppcasm_div6:
|
||||
$UMULL r12,r9,r8 #th = q*dh
|
||||
$CLRU r10,r5,`$BITS/2` #r10=dl
|
||||
$UMULL r6,r8,r10 #tl = q*dl
|
||||
|
||||
|
||||
Lppcasm_divinnerloop:
|
||||
subf r10,r12,r3 #t = h -th
|
||||
$SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
|
||||
@ -1761,7 +1761,7 @@ Lppcasm_div9:
|
||||
addi r4,r4,-$BNSZ
|
||||
addi r3,r3,-$BNSZ
|
||||
mtctr r5
|
||||
Lppcasm_sqr_mainloop:
|
||||
Lppcasm_sqr_mainloop:
|
||||
#sqr(r[0],r[1],a[0]);
|
||||
$LDU r6,$BNSZ(r4)
|
||||
$UMULL r7,r6,r6
|
||||
@ -1769,7 +1769,7 @@ Lppcasm_sqr_mainloop:
|
||||
$STU r7,$BNSZ(r3)
|
||||
$STU r8,$BNSZ(r3)
|
||||
bdnz Lppcasm_sqr_mainloop
|
||||
Lppcasm_sqr_adios:
|
||||
Lppcasm_sqr_adios:
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,0x14,0,0,0,3,0
|
||||
@ -1783,7 +1783,7 @@ Lppcasm_sqr_adios:
|
||||
# done in the build
|
||||
#
|
||||
|
||||
.align 4
|
||||
.align 4
|
||||
.bn_mul_words:
|
||||
#
|
||||
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
||||
@ -1797,7 +1797,7 @@ Lppcasm_sqr_adios:
|
||||
rlwinm. r7,r5,30,2,31 # num >> 2
|
||||
beq Lppcasm_mw_REM
|
||||
mtctr r7
|
||||
Lppcasm_mw_LOOP:
|
||||
Lppcasm_mw_LOOP:
|
||||
#mul(rp[0],ap[0],w,c1);
|
||||
$LD r8,`0*$BNSZ`(r4)
|
||||
$UMULL r9,r6,r8
|
||||
@ -1809,7 +1809,7 @@ Lppcasm_mw_LOOP:
|
||||
#using adde.
|
||||
$ST r9,`0*$BNSZ`(r3)
|
||||
#mul(rp[1],ap[1],w,c1);
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$UMULL r11,r6,r8
|
||||
$UMULH r12,r6,r8
|
||||
adde r11,r11,r10
|
||||
@ -1830,7 +1830,7 @@ Lppcasm_mw_LOOP:
|
||||
addze r12,r12 #this spin we collect carry into
|
||||
#r12
|
||||
$ST r11,`3*$BNSZ`(r3)
|
||||
|
||||
|
||||
addi r3,r3,`4*$BNSZ`
|
||||
addi r4,r4,`4*$BNSZ`
|
||||
bdnz Lppcasm_mw_LOOP
|
||||
@ -1846,25 +1846,25 @@ Lppcasm_mw_REM:
|
||||
addze r10,r10
|
||||
$ST r9,`0*$BNSZ`(r3)
|
||||
addi r12,r10,0
|
||||
|
||||
|
||||
addi r5,r5,-1
|
||||
cmpli 0,0,r5,0
|
||||
beq Lppcasm_mw_OVER
|
||||
|
||||
|
||||
|
||||
#mul(rp[1],ap[1],w,c1);
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$UMULL r9,r6,r8
|
||||
$UMULH r10,r6,r8
|
||||
addc r9,r9,r12
|
||||
addze r10,r10
|
||||
$ST r9,`1*$BNSZ`(r3)
|
||||
addi r12,r10,0
|
||||
|
||||
|
||||
addi r5,r5,-1
|
||||
cmpli 0,0,r5,0
|
||||
beq Lppcasm_mw_OVER
|
||||
|
||||
|
||||
#mul_add(rp[2],ap[2],w,c1);
|
||||
$LD r8,`2*$BNSZ`(r4)
|
||||
$UMULL r9,r6,r8
|
||||
@ -1873,8 +1873,8 @@ Lppcasm_mw_REM:
|
||||
addze r10,r10
|
||||
$ST r9,`2*$BNSZ`(r3)
|
||||
addi r12,r10,0
|
||||
|
||||
Lppcasm_mw_OVER:
|
||||
|
||||
Lppcasm_mw_OVER:
|
||||
addi r3,r12,0
|
||||
blr
|
||||
.long 0
|
||||
@ -1902,11 +1902,11 @@ Lppcasm_mw_OVER:
|
||||
# empirical evidence suggests that unrolled version performs best!!
|
||||
#
|
||||
xor r0,r0,r0 #r0 = 0
|
||||
xor r12,r12,r12 #r12 = 0 . used for carry
|
||||
xor r12,r12,r12 #r12 = 0 . used for carry
|
||||
rlwinm. r7,r5,30,2,31 # num >> 2
|
||||
beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
|
||||
mtctr r7
|
||||
Lppcasm_maw_mainloop:
|
||||
Lppcasm_maw_mainloop:
|
||||
#mul_add(rp[0],ap[0],w,c1);
|
||||
$LD r8,`0*$BNSZ`(r4)
|
||||
$LD r11,`0*$BNSZ`(r3)
|
||||
@ -1922,9 +1922,9 @@ Lppcasm_maw_mainloop:
|
||||
#by multiply and will be collected
|
||||
#in the next spin
|
||||
$ST r9,`0*$BNSZ`(r3)
|
||||
|
||||
|
||||
#mul_add(rp[1],ap[1],w,c1);
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$LD r8,`1*$BNSZ`(r4)
|
||||
$LD r9,`1*$BNSZ`(r3)
|
||||
$UMULL r11,r6,r8
|
||||
$UMULH r12,r6,r8
|
||||
@ -1933,7 +1933,7 @@ Lppcasm_maw_mainloop:
|
||||
addc r11,r11,r9
|
||||
#addze r12,r12
|
||||
$ST r11,`1*$BNSZ`(r3)
|
||||
|
||||
|
||||
#mul_add(rp[2],ap[2],w,c1);
|
||||
$LD r8,`2*$BNSZ`(r4)
|
||||
$UMULL r9,r6,r8
|
||||
@ -1944,7 +1944,7 @@ Lppcasm_maw_mainloop:
|
||||
addc r9,r9,r11
|
||||
#addze r10,r10
|
||||
$ST r9,`2*$BNSZ`(r3)
|
||||
|
||||
|
||||
#mul_add(rp[3],ap[3],w,c1);
|
||||
$LD r8,`3*$BNSZ`(r4)
|
||||
$UMULL r11,r6,r8
|
||||
@ -1958,7 +1958,7 @@ Lppcasm_maw_mainloop:
|
||||
addi r3,r3,`4*$BNSZ`
|
||||
addi r4,r4,`4*$BNSZ`
|
||||
bdnz Lppcasm_maw_mainloop
|
||||
|
||||
|
||||
Lppcasm_maw_leftover:
|
||||
andi. r5,r5,0x3
|
||||
beq Lppcasm_maw_adios
|
||||
@ -1975,10 +1975,10 @@ Lppcasm_maw_leftover:
|
||||
addc r9,r9,r12
|
||||
addze r12,r10
|
||||
$ST r9,0(r3)
|
||||
|
||||
|
||||
bdz Lppcasm_maw_adios
|
||||
#mul_add(rp[1],ap[1],w,c1);
|
||||
$LDU r8,$BNSZ(r4)
|
||||
$LDU r8,$BNSZ(r4)
|
||||
$UMULL r9,r6,r8
|
||||
$UMULH r10,r6,r8
|
||||
$LDU r11,$BNSZ(r3)
|
||||
@ -1987,7 +1987,7 @@ Lppcasm_maw_leftover:
|
||||
addc r9,r9,r12
|
||||
addze r12,r10
|
||||
$ST r9,0(r3)
|
||||
|
||||
|
||||
bdz Lppcasm_maw_adios
|
||||
#mul_add(rp[2],ap[2],w,c1);
|
||||
$LDU r8,$BNSZ(r4)
|
||||
@ -1999,8 +1999,8 @@ Lppcasm_maw_leftover:
|
||||
addc r9,r9,r12
|
||||
addze r12,r10
|
||||
$ST r9,0(r3)
|
||||
|
||||
Lppcasm_maw_adios:
|
||||
|
||||
Lppcasm_maw_adios:
|
||||
addi r3,r12,0
|
||||
blr
|
||||
.long 0
|
||||
|
@ -382,7 +382,7 @@ $code.=<<___;
|
||||
vpaddq $TEMP1, $ACC1, $ACC1
|
||||
vpmuludq 32*7-128($aap), $B2, $ACC2
|
||||
vpbroadcastq 32*5-128($tpa), $B2
|
||||
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
||||
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
||||
|
||||
vmovdqu $ACC6, 32*6-192($tp0)
|
||||
vmovdqu $ACC7, 32*7-192($tp0)
|
||||
@ -441,7 +441,7 @@ $code.=<<___;
|
||||
vmovdqu $ACC7, 32*16-448($tp1)
|
||||
lea 8($tp1), $tp1
|
||||
|
||||
dec $i
|
||||
dec $i
|
||||
jnz .LOOP_SQR_1024
|
||||
___
|
||||
$ZERO = $ACC9;
|
||||
@ -786,7 +786,7 @@ $code.=<<___;
|
||||
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
||||
vpaddq $TEMP3, $ACC7, $ACC7
|
||||
vpaddq $TEMP4, $ACC8, $ACC8
|
||||
|
||||
|
||||
vpsrlq \$29, $ACC4, $TEMP1
|
||||
vpand $AND_MASK, $ACC4, $ACC4
|
||||
vpsrlq \$29, $ACC5, $TEMP2
|
||||
@ -1451,7 +1451,7 @@ $code.=<<___;
|
||||
vpaddq $TEMP4, $ACC8, $ACC8
|
||||
|
||||
vmovdqu $ACC4, 128-128($rp)
|
||||
vmovdqu $ACC5, 160-128($rp)
|
||||
vmovdqu $ACC5, 160-128($rp)
|
||||
vmovdqu $ACC6, 192-128($rp)
|
||||
vmovdqu $ACC7, 224-128($rp)
|
||||
vmovdqu $ACC8, 256-128($rp)
|
||||
|
@ -282,9 +282,9 @@ $code.=<<___;
|
||||
movq %r9, 16(%rsp)
|
||||
movq %r10, 24(%rsp)
|
||||
shrq \$63, %rbx
|
||||
|
||||
|
||||
#third iteration
|
||||
movq 16($inp), %r9
|
||||
movq 16($inp), %r9
|
||||
movq 24($inp), %rax
|
||||
mulq %r9
|
||||
addq %rax, %r12
|
||||
@ -532,7 +532,7 @@ $code.=<<___;
|
||||
movl $times,128+8(%rsp)
|
||||
movq $out, %xmm0 # off-load
|
||||
movq %rbp, %xmm1 # off-load
|
||||
#first iteration
|
||||
#first iteration
|
||||
mulx %rax, %r8, %r9
|
||||
|
||||
mulx 16($inp), %rcx, %r10
|
||||
@ -568,7 +568,7 @@ $code.=<<___;
|
||||
mov %rax, (%rsp)
|
||||
mov %r8, 8(%rsp)
|
||||
|
||||
#second iteration
|
||||
#second iteration
|
||||
mulx 16($inp), %rax, %rbx
|
||||
adox %rax, %r10
|
||||
adcx %rbx, %r11
|
||||
@ -607,8 +607,8 @@ $code.=<<___;
|
||||
|
||||
mov %r9, 16(%rsp)
|
||||
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
|
||||
|
||||
#third iteration
|
||||
|
||||
#third iteration
|
||||
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
|
||||
adox $out, %r12
|
||||
adcx %r9, %r13
|
||||
@ -643,8 +643,8 @@ $code.=<<___;
|
||||
|
||||
mov %r11, 32(%rsp)
|
||||
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
|
||||
|
||||
#fourth iteration
|
||||
|
||||
#fourth iteration
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
|
||||
adox %rax, %r14
|
||||
adcx %rbx, %r15
|
||||
@ -676,8 +676,8 @@ $code.=<<___;
|
||||
|
||||
mov %r13, 48(%rsp)
|
||||
mov %r14, 56(%rsp)
|
||||
|
||||
#fifth iteration
|
||||
|
||||
#fifth iteration
|
||||
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
|
||||
adox $out, %r8
|
||||
adcx %r11, %r9
|
||||
@ -704,8 +704,8 @@ $code.=<<___;
|
||||
|
||||
mov %r15, 64(%rsp)
|
||||
mov %r8, 72(%rsp)
|
||||
|
||||
#sixth iteration
|
||||
|
||||
#sixth iteration
|
||||
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
|
||||
adox %rax, %r10
|
||||
adcx %rbx, %r11
|
||||
@ -1048,7 +1048,7 @@ $code.=<<___;
|
||||
movq 56($ap), %rax
|
||||
movq %rdx, %r14
|
||||
adcq \$0, %r14
|
||||
|
||||
|
||||
mulq %rbx
|
||||
addq %rax, %r14
|
||||
movq ($ap), %rax
|
||||
@ -1150,7 +1150,7 @@ $code.=<<___;
|
||||
movq ($ap), %rax
|
||||
adcq \$0, %rdx
|
||||
addq %r15, %r14
|
||||
movq %rdx, %r15
|
||||
movq %rdx, %r15
|
||||
adcq \$0, %r15
|
||||
|
||||
leaq 8(%rdi), %rdi
|
||||
@ -1212,7 +1212,7 @@ $code.=<<___ if ($addx);
|
||||
|
||||
mulx 48($ap), %rbx, %r14
|
||||
adcx %rax, %r12
|
||||
|
||||
|
||||
mulx 56($ap), %rax, %r15
|
||||
adcx %rbx, %r13
|
||||
adcx %rax, %r14
|
||||
@ -1411,7 +1411,7 @@ $code.=<<___;
|
||||
___
|
||||
$code.=<<___ if ($addx);
|
||||
jmp .Lmul_scatter_tail
|
||||
|
||||
|
||||
.align 32
|
||||
.Lmulx_scatter:
|
||||
movq ($out), %rdx # pass b[0]
|
||||
@ -1824,7 +1824,7 @@ __rsaz_512_mul:
|
||||
movq 56($ap), %rax
|
||||
movq %rdx, %r14
|
||||
adcq \$0, %r14
|
||||
|
||||
|
||||
mulq %rbx
|
||||
addq %rax, %r14
|
||||
movq ($ap), %rax
|
||||
@ -1901,7 +1901,7 @@ __rsaz_512_mul:
|
||||
movq ($ap), %rax
|
||||
adcq \$0, %rdx
|
||||
addq %r15, %r14
|
||||
movq %rdx, %r15
|
||||
movq %rdx, %r15
|
||||
adcq \$0, %r15
|
||||
|
||||
leaq 8(%rdi), %rdi
|
||||
|
@ -198,7 +198,7 @@ $code.=<<___;
|
||||
xgr $hi,@r[1]
|
||||
xgr $lo,@r[0]
|
||||
xgr $hi,@r[2]
|
||||
xgr $lo,@r[3]
|
||||
xgr $lo,@r[3]
|
||||
xgr $hi,@r[3]
|
||||
xgr $lo,$hi
|
||||
stg $hi,16($rp)
|
||||
|
@ -76,7 +76,7 @@
|
||||
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
|
||||
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
|
||||
#
|
||||
# Conclusions:
|
||||
# Conclusions:
|
||||
# - VIA SDK leaves a *lot* of room for improvement (which this
|
||||
# implementation successfully fills:-);
|
||||
# - 'rep montmul' gives up to >3x performance improvement depending on
|
||||
|
@ -39,7 +39,7 @@ require "x86asm.pl";
|
||||
|
||||
$output = pop;
|
||||
open STDOUT,">$output";
|
||||
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
$sse2=0;
|
||||
|
@ -1049,7 +1049,7 @@ my $bptr="%rdx"; # const void *table,
|
||||
my $nptr="%rcx"; # const BN_ULONG *nptr,
|
||||
my $n0 ="%r8"; # const BN_ULONG *n0);
|
||||
my $num ="%r9"; # int num, has to be divisible by 8
|
||||
# int pwr
|
||||
# int pwr
|
||||
|
||||
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
|
||||
my @A0=("%r10","%r11");
|
||||
@ -1126,7 +1126,7 @@ $code.=<<___;
|
||||
ja .Lpwr_page_walk
|
||||
.Lpwr_page_walk_done:
|
||||
|
||||
mov $num,%r10
|
||||
mov $num,%r10
|
||||
neg $num
|
||||
|
||||
##############################################################
|
||||
@ -2036,7 +2036,7 @@ __bn_post4x_internal:
|
||||
jnz .Lsqr4x_sub
|
||||
|
||||
mov $num,%r10 # prepare for back-to-back call
|
||||
neg $num # restore $num
|
||||
neg $num # restore $num
|
||||
ret
|
||||
.size __bn_post4x_internal,.-__bn_post4x_internal
|
||||
___
|
||||
@ -2259,7 +2259,7 @@ bn_mulx4x_mont_gather5:
|
||||
mov \$0,%r10
|
||||
cmovc %r10,%r11
|
||||
sub %r11,%rbp
|
||||
.Lmulx4xsp_done:
|
||||
.Lmulx4xsp_done:
|
||||
and \$-64,%rbp # ensure alignment
|
||||
mov %rsp,%r11
|
||||
sub %rbp,%r11
|
||||
@ -2741,7 +2741,7 @@ bn_powerx5:
|
||||
ja .Lpwrx_page_walk
|
||||
.Lpwrx_page_walk_done:
|
||||
|
||||
mov $num,%r10
|
||||
mov $num,%r10
|
||||
neg $num
|
||||
|
||||
##############################################################
|
||||
|
@ -792,9 +792,9 @@ if ($OPENSSL) {
|
||||
64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
|
||||
|
||||
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
|
||||
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
|
||||
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
|
||||
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
|
||||
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
|
||||
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
|
||||
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
|
||||
|
||||
&set_label("Camellia_SIGMA",64);
|
||||
&data_word(
|
||||
|
@ -7,7 +7,7 @@
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# This flag makes the inner loop one cycle longer, but generates
|
||||
# This flag makes the inner loop one cycle longer, but generates
|
||||
# code that runs 30% faster on the pentium pro/II, 44% faster
|
||||
# on PIII, while only 7% slower on the pentium.
|
||||
# By default, this flag is on.
|
||||
@ -157,7 +157,7 @@ sub E_CAST {
|
||||
if ($ppro) {
|
||||
&xor( $tmp1, $tmp1);
|
||||
&mov( $tmp2, 0xff);
|
||||
|
||||
|
||||
&movb( &LB($tmp1), &HB($tmp4)); # A
|
||||
&and( $tmp2, $tmp4);
|
||||
|
||||
@ -166,7 +166,7 @@ sub E_CAST {
|
||||
} else {
|
||||
&mov( $tmp2, $tmp4); # B
|
||||
&movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD
|
||||
|
||||
|
||||
&shr( $tmp4, 16); #
|
||||
&and( $tmp2, 0xff);
|
||||
}
|
||||
|
@ -15,7 +15,7 @@
|
||||
# ====================================================================
|
||||
#
|
||||
# December 2014
|
||||
#
|
||||
#
|
||||
# ChaCha20 for ARMv4.
|
||||
#
|
||||
# Performance in cycles per byte out of large buffer.
|
||||
@ -720,7 +720,7 @@ ChaCha20_neon:
|
||||
vadd.i32 $d2,$d1,$t0 @ counter+2
|
||||
str @t[3], [sp,#4*(16+15)]
|
||||
mov @t[3],#10
|
||||
add @x[12],@x[12],#3 @ counter+3
|
||||
add @x[12],@x[12],#3 @ counter+3
|
||||
b .Loop_neon
|
||||
|
||||
.align 4
|
||||
|
@ -15,7 +15,7 @@
|
||||
# ====================================================================
|
||||
#
|
||||
# June 2015
|
||||
#
|
||||
#
|
||||
# ChaCha20 for ARMv8.
|
||||
#
|
||||
# Performance in cycles per byte out of large buffer.
|
||||
@ -201,7 +201,7 @@ ChaCha20_ctr32:
|
||||
mov $ctr,#10
|
||||
subs $len,$len,#64
|
||||
.Loop:
|
||||
sub $ctr,$ctr,#1
|
||||
sub $ctr,$ctr,#1
|
||||
___
|
||||
foreach (&ROUND(0, 4, 8,12)) { eval; }
|
||||
foreach (&ROUND(0, 5,10,15)) { eval; }
|
||||
|
@ -15,7 +15,7 @@
|
||||
# ====================================================================
|
||||
#
|
||||
# October 2015
|
||||
#
|
||||
#
|
||||
# ChaCha20 for PowerPC/AltiVec.
|
||||
#
|
||||
# Performance in cycles per byte out of large buffer.
|
||||
@ -524,7 +524,7 @@ $code.=<<___;
|
||||
lwz @d[3],12($ctr)
|
||||
vadduwm @K[5],@K[4],@K[5]
|
||||
|
||||
vspltisw $twenty,-12 # synthesize constants
|
||||
vspltisw $twenty,-12 # synthesize constants
|
||||
vspltisw $twelve,12
|
||||
vspltisw $twenty5,-7
|
||||
#vspltisw $seven,7 # synthesized in the loop
|
||||
|
@ -111,7 +111,7 @@ sub D_ENCRYPT
|
||||
&and( $u, "0xfcfcfcfc" ); # 2
|
||||
&xor( $tmp1, $tmp1); # 1
|
||||
&and( $t, "0xcfcfcfcf" ); # 2
|
||||
&xor( $tmp2, $tmp2);
|
||||
&xor( $tmp2, $tmp2);
|
||||
&movb( &LB($tmp1), &LB($u) );
|
||||
&movb( &LB($tmp2), &HB($u) );
|
||||
&rotr( $t, 4 );
|
||||
@ -175,7 +175,7 @@ sub IP_new
|
||||
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
|
||||
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
|
||||
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
|
||||
|
||||
|
||||
if ($lr != 3)
|
||||
{
|
||||
if (($lr-3) < 0)
|
||||
|
@ -85,7 +85,7 @@ sub DES_encrypt_internal()
|
||||
|
||||
&function_end_B("_x86_DES_encrypt");
|
||||
}
|
||||
|
||||
|
||||
sub DES_decrypt_internal()
|
||||
{
|
||||
&function_begin_B("_x86_DES_decrypt");
|
||||
@ -122,7 +122,7 @@ sub DES_decrypt_internal()
|
||||
|
||||
&function_end_B("_x86_DES_decrypt");
|
||||
}
|
||||
|
||||
|
||||
sub DES_encrypt
|
||||
{
|
||||
local($name,$do_ip)=@_;
|
||||
@ -283,7 +283,7 @@ sub IP_new
|
||||
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
|
||||
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
|
||||
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
|
||||
|
||||
|
||||
if ($lr != 3)
|
||||
{
|
||||
if (($lr-3) < 0)
|
||||
|
@ -34,7 +34,7 @@ sub DES_encrypt3
|
||||
&IP_new($L,$R,"edx",0);
|
||||
|
||||
# put them back
|
||||
|
||||
|
||||
if ($enc)
|
||||
{
|
||||
&mov(&DWP(4,"ebx","",0),$R);
|
||||
|
@ -660,7 +660,7 @@ __ecp_nistz256_div_by_2:
|
||||
adc $ap,xzr,xzr // zap $ap
|
||||
tst $acc0,#1 // is a even?
|
||||
|
||||
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
|
||||
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
|
||||
csel $acc1,$acc1,$t1,eq
|
||||
csel $acc2,$acc2,$t2,eq
|
||||
csel $acc3,$acc3,$t3,eq
|
||||
|
@ -1874,7 +1874,7 @@ $code.=<<___ if ($i<3);
|
||||
ldx [$bp+8*($i+1)],$bi ! bp[$i+1]
|
||||
___
|
||||
$code.=<<___;
|
||||
addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
|
||||
addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
|
||||
sllx $acc0,32,$t0
|
||||
addxccc $acc2,$t1,$acc2
|
||||
srlx $acc0,32,$t1
|
||||
|
@ -443,7 +443,7 @@ for(1..37) {
|
||||
&mov (&DWP(20,"esp"),"eax");
|
||||
&mov (&DWP(24,"esp"),"eax");
|
||||
&mov (&DWP(28,"esp"),"eax");
|
||||
|
||||
|
||||
&call ("_ecp_nistz256_sub");
|
||||
|
||||
&stack_pop(8);
|
||||
|
@ -611,7 +611,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc0
|
||||
|
||||
########################################################################
|
||||
# Second reduction step
|
||||
# Second reduction step
|
||||
mov $acc1, $t1
|
||||
shl \$32, $acc1
|
||||
mulq $poly3
|
||||
@ -658,7 +658,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc1
|
||||
|
||||
########################################################################
|
||||
# Third reduction step
|
||||
# Third reduction step
|
||||
mov $acc2, $t1
|
||||
shl \$32, $acc2
|
||||
mulq $poly3
|
||||
@ -705,7 +705,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc2
|
||||
|
||||
########################################################################
|
||||
# Final reduction step
|
||||
# Final reduction step
|
||||
mov $acc3, $t1
|
||||
shl \$32, $acc3
|
||||
mulq $poly3
|
||||
@ -718,7 +718,7 @@ __ecp_nistz256_mul_montq:
|
||||
mov $acc5, $t1
|
||||
adc \$0, $acc2
|
||||
|
||||
########################################################################
|
||||
########################################################################
|
||||
# Branch-less conditional subtraction of P
|
||||
sub \$-1, $acc4 # .Lpoly[0]
|
||||
mov $acc0, $t2
|
||||
@ -2118,7 +2118,7 @@ $code.=<<___;
|
||||
movq %xmm1, $r_ptr
|
||||
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
|
||||
___
|
||||
{
|
||||
{
|
||||
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
|
||||
# operate in 4-5-6-7 "name space" that matches squaring output
|
||||
#
|
||||
@ -2207,7 +2207,7 @@ $code.=<<___;
|
||||
lea $M(%rsp), $b_ptr
|
||||
mov $acc4, $acc6 # harmonize sub output and mul input
|
||||
xor %ecx, %ecx
|
||||
mov $acc4, $S+8*0(%rsp) # have to save:-(
|
||||
mov $acc4, $S+8*0(%rsp) # have to save:-(
|
||||
mov $acc5, $acc2
|
||||
mov $acc5, $S+8*1(%rsp)
|
||||
cmovz $acc0, $acc3
|
||||
@ -3055,8 +3055,8 @@ ___
|
||||
########################################################################
|
||||
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
|
||||
#
|
||||
open TABLE,"<ecp_nistz256_table.c" or
|
||||
open TABLE,"<${dir}../ecp_nistz256_table.c" or
|
||||
open TABLE,"<ecp_nistz256_table.c" or
|
||||
open TABLE,"<${dir}../ecp_nistz256_table.c" or
|
||||
die "failed to open ecp_nistz256_table.c:",$!;
|
||||
|
||||
use integer;
|
||||
|
@ -57,7 +57,7 @@ sub R0
|
||||
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
|
||||
|
||||
&mov($tmp1,$C) if $pos < 0;
|
||||
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
|
||||
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
|
||||
|
||||
# body proper
|
||||
|
||||
|
@ -242,7 +242,7 @@ md5_block_asm_data_order:
|
||||
ldd [%o1 + 0x20], %f16
|
||||
ldd [%o1 + 0x28], %f18
|
||||
ldd [%o1 + 0x30], %f20
|
||||
subcc %o2, 1, %o2 ! done yet?
|
||||
subcc %o2, 1, %o2 ! done yet?
|
||||
ldd [%o1 + 0x38], %f22
|
||||
add %o1, 0x40, %o1
|
||||
prefetch [%o1 + 63], 20
|
||||
|
@ -15,7 +15,7 @@
|
||||
&& !defined(_MIPS_ARCH_MIPS32R2)
|
||||
# define _MIPS_ARCH_MIPS32R2
|
||||
# endif
|
||||
|
||||
|
||||
# if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \
|
||||
defined(_MIPS_ARCH_MIPS64R6)) \
|
||||
&& !defined(_MIPS_ARCH_MIPS64R2)
|
||||
|
@ -54,7 +54,7 @@
|
||||
#
|
||||
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
# ====================================================================
|
||||
@ -528,7 +528,7 @@ $code.=<<___;
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
sub $Xi,#16
|
||||
sub $Xi,#16
|
||||
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
|
||||
vst1.64 $Xl#lo,[$Xi]
|
||||
|
||||
|
@ -158,7 +158,7 @@ $code.=<<___;
|
||||
lg $Zhi,0+1($Xi)
|
||||
lghi $tmp,0
|
||||
.Louter:
|
||||
xg $Zhi,0($inp) # Xi ^= inp
|
||||
xg $Zhi,0($inp) # Xi ^= inp
|
||||
xg $Zlo,8($inp)
|
||||
xgr $Zhi,$tmp
|
||||
stg $Zlo,8+1($Xi)
|
||||
|
@ -811,7 +811,7 @@ sub mmx_loop() {
|
||||
&bswap ($dat);
|
||||
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
|
||||
&bswap ("ebx");
|
||||
|
||||
|
||||
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
|
||||
&jne (&label("outer"));
|
||||
}
|
||||
@ -915,7 +915,7 @@ my ($Xhi,$Xi) = @_;
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
|
||||
@ -1085,7 +1085,7 @@ my ($Xhi,$Xi) = @_;
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
&pshufd ($T1,$Xhn,0b01001110);
|
||||
|
@ -468,7 +468,7 @@ $code.=<<___;
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T1 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pxor $T1,$Xhi #
|
||||
|
||||
@ -582,7 +582,7 @@ ___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
|
||||
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
|
||||
# experimental alternative. special thing about is that there
|
||||
# no dependency between the two multiplications...
|
||||
# no dependency between the two multiplications...
|
||||
mov \$`0xE1<<1`,%eax
|
||||
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
|
||||
mov \$0x07,%r11d
|
||||
@ -757,7 +757,7 @@ $code.=<<___;
|
||||
movdqa $T2,$T1 #
|
||||
pslldq \$8,$T2
|
||||
pclmulqdq \$0x00,$Hkey2,$Xln
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pxor $T1,$Xhi #
|
||||
movdqu 0($inp),$T1
|
||||
@ -893,7 +893,7 @@ $code.=<<___;
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T1 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pshufd \$0b01001110,$Xhn,$Xmn
|
||||
pxor $T1,$Xhi #
|
||||
|
@ -15,7 +15,7 @@
|
||||
# des_cblock (*ivec);
|
||||
# int enc;
|
||||
#
|
||||
# calls
|
||||
# calls
|
||||
# des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT);
|
||||
#
|
||||
|
||||
@ -36,7 +36,7 @@ sub cbc
|
||||
# name is the function name
|
||||
# enc_func and dec_func and the functions to call for encrypt/decrypt
|
||||
# swap is true if byte order needs to be reversed
|
||||
# iv_off is parameter number for the iv
|
||||
# iv_off is parameter number for the iv
|
||||
# enc_off is parameter number for the encrypt/decrypt flag
|
||||
# p1,p2,p3 are the offsets for parameters to be passed to the
|
||||
# underlying calls.
|
||||
@ -114,7 +114,7 @@ sub cbc
|
||||
#############################################################
|
||||
|
||||
&set_label("encrypt_loop");
|
||||
# encrypt start
|
||||
# encrypt start
|
||||
# "eax" and "ebx" hold iv (or the last cipher text)
|
||||
|
||||
&mov("ecx", &DWP(0,$in,"",0)); # load first 4 bytes
|
||||
@ -208,7 +208,7 @@ sub cbc
|
||||
#############################################################
|
||||
#############################################################
|
||||
&set_label("decrypt",1);
|
||||
# decrypt start
|
||||
# decrypt start
|
||||
&and($count,0xfffffff8);
|
||||
# The next 2 instructions are only for if the jz is taken
|
||||
&mov("eax", &DWP($data_off+8,"esp","",0)); # get iv[0]
|
||||
@ -350,7 +350,7 @@ sub cbc
|
||||
&align(64);
|
||||
|
||||
&function_end_B($name);
|
||||
|
||||
|
||||
}
|
||||
|
||||
1;
|
||||
|
@ -36,7 +36,7 @@ my $globl = sub {
|
||||
my $ret;
|
||||
|
||||
$name =~ s|^\.||;
|
||||
|
||||
|
||||
SWITCH: for ($flavour) {
|
||||
/aix/ && do { if (!$$type) {
|
||||
$$type = "\@function";
|
||||
|
@ -117,7 +117,7 @@ $::code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 1, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
brnz,pt $len, .L${bits}_cbc_enc_loop
|
||||
@ -224,7 +224,7 @@ $::code.=<<___;
|
||||
call _${alg}${bits}_encrypt_1x
|
||||
add $inp, 16, $inp
|
||||
sub $len, 1, $len
|
||||
|
||||
|
||||
stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
|
||||
add $out, 8, $out
|
||||
stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
|
||||
@ -339,7 +339,7 @@ $::code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 1, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
brnz,pt $len, .L${bits}_cbc_dec_loop2x
|
||||
@ -445,7 +445,7 @@ $::code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 2, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
std %f4, [$out + 16]
|
||||
@ -702,7 +702,7 @@ $::code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 1, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
brnz,pt $len, .L${bits}_ctr32_loop2x
|
||||
@ -791,7 +791,7 @@ $::code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 2, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
std %f4, [$out + 16]
|
||||
@ -1024,7 +1024,7 @@ $code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 1, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
brnz,pt $len, .L${bits}_xts_${dir}loop2x
|
||||
@ -1135,7 +1135,7 @@ $code.=<<___;
|
||||
|
||||
brnz,pn $ooff, 2f
|
||||
sub $len, 2, $len
|
||||
|
||||
|
||||
std %f0, [$out + 0]
|
||||
std %f2, [$out + 8]
|
||||
std %f4, [$out + 16]
|
||||
|
@ -151,7 +151,7 @@ my %globals;
|
||||
if ($gas) {
|
||||
if ($self->{op} eq "movz") { # movz is pain...
|
||||
sprintf "%s%s%s",$self->{op},$self->{sz},shift;
|
||||
} elsif ($self->{op} =~ /^set/) {
|
||||
} elsif ($self->{op} =~ /^set/) {
|
||||
"$self->{op}";
|
||||
} elsif ($self->{op} eq "ret") {
|
||||
my $epilogue = "";
|
||||
@ -178,7 +178,7 @@ my %globals;
|
||||
$self->{op} .= $self->{sz};
|
||||
} elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
|
||||
$self->{op} = "\tDQ";
|
||||
}
|
||||
}
|
||||
$self->{op};
|
||||
}
|
||||
}
|
||||
@ -639,7 +639,7 @@ my %globals;
|
||||
if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
|
||||
{ $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
|
||||
$var;
|
||||
};
|
||||
};
|
||||
|
||||
$sz =~ tr/bvlrq/BWDDQ/;
|
||||
$self->{value} = "\tD$sz\t";
|
||||
@ -649,7 +649,7 @@ my %globals;
|
||||
};
|
||||
/\.byte/ && do { my @str=split(/,\s*/,$$line);
|
||||
map(s/(0b[0-1]+)/oct($1)/eig,@str);
|
||||
map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
|
||||
map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
|
||||
while ($#str>15) {
|
||||
$self->{value}.="DB\t"
|
||||
.join(",",@str[0..15])."\n";
|
||||
@ -896,7 +896,7 @@ while(defined(my $line=<>)) {
|
||||
printf "%s",$directive->out();
|
||||
} elsif (my $opcode=opcode->re(\$line)) {
|
||||
my $asm = eval("\$".$opcode->mnemonic());
|
||||
|
||||
|
||||
if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
|
||||
print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
|
||||
next;
|
||||
@ -974,7 +974,7 @@ close STDOUT;
|
||||
# %r13 - -
|
||||
# %r14 - -
|
||||
# %r15 - -
|
||||
#
|
||||
#
|
||||
# (*) volatile register
|
||||
# (-) preserved by callee
|
||||
# (#) Nth argument, volatile
|
||||
|
@ -132,7 +132,7 @@ ___
|
||||
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
|
||||
push (@out,$comm)
|
||||
}
|
||||
push (@out,$initseg) if ($initseg);
|
||||
push (@out,$initseg) if ($initseg);
|
||||
}
|
||||
|
||||
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }
|
||||
|
@ -89,7 +89,7 @@ _RC4:
|
||||
|| NOP 5
|
||||
STB $XX,*${KEYA}[-2] ; key->x
|
||||
|| SUB4 $YY,$TX,$YY
|
||||
|| BNOP B3
|
||||
|| BNOP B3
|
||||
STB $YY,*${KEYB}[-1] ; key->y
|
||||
|| NOP 5
|
||||
.endasmfunc
|
||||
|
@ -51,7 +51,7 @@ my ($rc4,$md5)=(1,1); # what to generate?
|
||||
my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
|
||||
# but its result is discarded. Idea here is
|
||||
# to be able to use 'openssl speed rc4' for
|
||||
# benchmarking the stitched subroutine...
|
||||
# benchmarking the stitched subroutine...
|
||||
|
||||
my $flavour = shift;
|
||||
my $output = shift;
|
||||
@ -419,7 +419,7 @@ $code.=<<___ if ($rc4 && (!$md5 || $D));
|
||||
and \$63,$len # remaining bytes
|
||||
jnz .Loop1
|
||||
jmp .Ldone
|
||||
|
||||
|
||||
.align 16
|
||||
.Loop1:
|
||||
add $TX[0]#b,$YY#b
|
||||
|
@ -98,7 +98,7 @@ sub unrolledloopbody {
|
||||
for ($i=0;$i<4;$i++) {
|
||||
$code.=<<___;
|
||||
ldo 1($XX[0]),$XX[1]
|
||||
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
|
||||
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
|
||||
and $mask,$XX[1],$XX[1]
|
||||
$LDX $YY($key),$TY
|
||||
$MKX $YY,$key,$ix
|
||||
@ -166,7 +166,7 @@ RC4
|
||||
ldo `2*$SZ`($key),$key
|
||||
|
||||
ldi 0xff,$mask
|
||||
ldi 3,$dat0
|
||||
ldi 3,$dat0
|
||||
|
||||
ldo 1($XX[0]),$XX[0] ; warm up loop
|
||||
and $mask,$XX[0],$XX[0]
|
||||
|
@ -48,7 +48,7 @@
|
||||
|
||||
# April 2005
|
||||
#
|
||||
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
|
||||
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
|
||||
# those with add/sub results in 50% performance improvement of folded
|
||||
# loop...
|
||||
|
||||
|
@ -34,7 +34,7 @@ $KL2=0x6ED9EBA1;
|
||||
$KL3=0x8F1BBCDC;
|
||||
$KL4=0xA953FD4E;
|
||||
$KR0=0x50A28BE6;
|
||||
$KR1=0x5C4DD124;
|
||||
$KR1=0x5C4DD124;
|
||||
$KR2=0x6D703EF3;
|
||||
$KR3=0x7A6D76E9;
|
||||
|
||||
@ -543,28 +543,28 @@ sub ripemd160_block
|
||||
# &mov($tmp2, &wparam(0)); # Moved into last round
|
||||
|
||||
&mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B
|
||||
&add($D, $tmp1);
|
||||
&add($D, $tmp1);
|
||||
&mov($tmp1, &swtmp(16+2)); # $c
|
||||
&add($D, $tmp1);
|
||||
|
||||
&mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C
|
||||
&add($E, $tmp1);
|
||||
&add($E, $tmp1);
|
||||
&mov($tmp1, &swtmp(16+3)); # $d
|
||||
&add($E, $tmp1);
|
||||
|
||||
&mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D
|
||||
&add($A, $tmp1);
|
||||
&add($A, $tmp1);
|
||||
&mov($tmp1, &swtmp(16+4)); # $e
|
||||
&add($A, $tmp1);
|
||||
|
||||
|
||||
&mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E
|
||||
&add($B, $tmp1);
|
||||
&add($B, $tmp1);
|
||||
&mov($tmp1, &swtmp(16+0)); # $a
|
||||
&add($B, $tmp1);
|
||||
|
||||
&mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A
|
||||
&add($C, $tmp1);
|
||||
&add($C, $tmp1);
|
||||
&mov($tmp1, &swtmp(16+1)); # $b
|
||||
&add($C, $tmp1);
|
||||
|
||||
|
@ -133,7 +133,7 @@ $ymm=1 if ($xmm &&
|
||||
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
|
||||
$1>=2.19); # first version supporting AVX
|
||||
|
||||
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
|
||||
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
|
||||
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
|
||||
$1>=2.03); # first version supporting AVX
|
||||
|
||||
|
@ -95,7 +95,7 @@ $K="%xmm15";
|
||||
|
||||
if (1) {
|
||||
# Atom-specific optimization aiming to eliminate pshufb with high
|
||||
# registers [and thus get rid of 48 cycles accumulated penalty]
|
||||
# registers [and thus get rid of 48 cycles accumulated penalty]
|
||||
@Xi=map("%xmm$_",(0..4));
|
||||
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
|
||||
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
|
||||
@ -126,7 +126,7 @@ my $k=$i+2;
|
||||
# ...
|
||||
# $i==13: 14,15,15,15,
|
||||
# $i==14: 15
|
||||
#
|
||||
#
|
||||
# Then at $i==15 Xupdate is applied one iteration in advance...
|
||||
$code.=<<___ if ($i==0);
|
||||
movd (@ptr[0]),@Xi[0]
|
||||
|
@ -227,7 +227,7 @@ sha1_block_data_order:
|
||||
ldd [%o1 + 0x20], %f16
|
||||
ldd [%o1 + 0x28], %f18
|
||||
ldd [%o1 + 0x30], %f20
|
||||
subcc %o2, 1, %o2 ! done yet?
|
||||
subcc %o2, 1, %o2 ! done yet?
|
||||
ldd [%o1 + 0x38], %f22
|
||||
add %o1, 0x40, %o1
|
||||
prefetch [%o1 + 63], 20
|
||||
|
@ -519,7 +519,7 @@ $code.=<<___;
|
||||
mov $Cctx,$C
|
||||
mov $Dctx,$D
|
||||
mov $Ectx,$E
|
||||
alignaddr %g0,$tmp0,%g0
|
||||
alignaddr %g0,$tmp0,%g0
|
||||
dec 1,$len
|
||||
ba .Loop
|
||||
mov $nXfer,$Xfer
|
||||
|
@ -262,7 +262,7 @@ sha1_block_data_order:
|
||||
jz .Lialu
|
||||
___
|
||||
$code.=<<___ if ($shaext);
|
||||
test \$`1<<29`,%r10d # check SHA bit
|
||||
test \$`1<<29`,%r10d # check SHA bit
|
||||
jnz _shaext_shortcut
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
|
@ -47,7 +47,7 @@
|
||||
#
|
||||
# Performance in clock cycles per processed byte (less is better):
|
||||
#
|
||||
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
|
||||
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
|
||||
# Pentium 46 57 40/38 - -
|
||||
# PIII 36 33 27/24 - -
|
||||
# P4 41 38 28 - 17.3
|
||||
@ -276,7 +276,7 @@ my $suffix=shift;
|
||||
&mov ($Coff,"ecx");
|
||||
&mov ($Doff,"edi");
|
||||
&mov (&DWP(0,"esp"),"ebx"); # magic
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ("ebx",&DWP(20,"esi"));
|
||||
&mov ("ecx",&DWP(24,"esi"));
|
||||
&mov ("edi",&DWP(28,"esi"));
|
||||
@ -385,7 +385,7 @@ my @AH=($A,$K256);
|
||||
&xor ($AH[1],"ecx"); # magic
|
||||
&mov (&DWP(8,"esp"),"ecx");
|
||||
&mov (&DWP(12,"esp"),"ebx");
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ("ebx",&DWP(20,"esi"));
|
||||
&mov ("ecx",&DWP(24,"esi"));
|
||||
&mov ("esi",&DWP(28,"esi"));
|
||||
|
@ -36,7 +36,7 @@
|
||||
# (iii) "this" is for n=8, when we gather twice as much data, result
|
||||
# for n=4 is 20.3+4.44=24.7;
|
||||
# (iv) presented improvement coefficients are asymptotic limits and
|
||||
# in real-life application are somewhat lower, e.g. for 2KB
|
||||
# in real-life application are somewhat lower, e.g. for 2KB
|
||||
# fragments they range from 75% to 130% (on Haswell);
|
||||
|
||||
$flavour = shift;
|
||||
|
@ -383,7 +383,7 @@ if ($sse2) {
|
||||
|
||||
&set_label("16_79_sse2",16);
|
||||
for ($j=0;$j<2;$j++) { # 2x unroll
|
||||
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
|
||||
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
|
||||
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
|
||||
&movq ("mm1","mm7");
|
||||
&psrlq ("mm7",1);
|
||||
|
@ -26,7 +26,7 @@
|
||||
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
|
||||
#
|
||||
#
|
||||
# (*) Software SHA256 results are of lesser relevance, presented
|
||||
# mostly for informational purposes.
|
||||
# (**) The result is a trade-off: it's possible to improve it by
|
||||
|
@ -368,7 +368,7 @@ L\$parisc1
|
||||
___
|
||||
|
||||
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
|
||||
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
|
||||
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
|
||||
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
|
||||
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
|
||||
$a0 ="%r17";
|
||||
@ -419,7 +419,7 @@ $code.=<<___;
|
||||
add $t0,$hlo,$hlo
|
||||
shd $ahi,$alo,$Sigma0[0],$t0
|
||||
addc $t1,$hhi,$hhi ; h += Sigma1(e)
|
||||
shd $alo,$ahi,$Sigma0[0],$t1
|
||||
shd $alo,$ahi,$Sigma0[0],$t1
|
||||
add $a0,$hlo,$hlo
|
||||
shd $ahi,$alo,$Sigma0[1],$t2
|
||||
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
|
||||
|
@ -311,7 +311,7 @@ $code.=<<___;
|
||||
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
|
||||
jne .Lloop
|
||||
|
||||
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
|
||||
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
|
||||
br %r14
|
||||
.size $Func,.-$Func
|
||||
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
|
@ -102,7 +102,7 @@ if ($output =~ /512/) {
|
||||
|
||||
$locals=0; # X[16] is register resident
|
||||
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
|
||||
|
||||
|
||||
$A="%l0";
|
||||
$B="%l1";
|
||||
$C="%l2";
|
||||
@ -254,7 +254,7 @@ $code.=<<___;
|
||||
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
|
||||
xor $tmp0,$h,$h
|
||||
$SRL $a,@Sigma0[2],$tmp0
|
||||
xor $tmp1,$h,$h
|
||||
xor $tmp1,$h,$h
|
||||
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
|
||||
xor $tmp0,$h,$h
|
||||
xor $tmp1,$h,$h ! Sigma0(a)
|
||||
|
@ -1782,7 +1782,7 @@ if ($avx>1) {{
|
||||
######################################################################
|
||||
# AVX2+BMI code path
|
||||
#
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $PUSH8=8*2*$SZ;
|
||||
use integer;
|
||||
|
||||
|
@ -480,7 +480,7 @@ static char *ts_get_status_text(STACK_OF(ASN1_UTF8STRING) *text)
|
||||
return result;
|
||||
}
|
||||
|
||||
static int ts_check_policy(const ASN1_OBJECT *req_oid,
|
||||
static int ts_check_policy(const ASN1_OBJECT *req_oid,
|
||||
const TS_TST_INFO *tst_info)
|
||||
{
|
||||
const ASN1_OBJECT *resp_oid = tst_info->policy_id;
|
||||
|
@ -31,7 +31,7 @@
|
||||
# multiplying 64 by CPU clock frequency and dividing by relevant
|
||||
# value from the given table:
|
||||
#
|
||||
# $SCALE=2/8 icc8 gcc3
|
||||
# $SCALE=2/8 icc8 gcc3
|
||||
# Intel P4 3200/4600 4600(*) 6400
|
||||
# Intel PIII 2900/3000 4900 5400
|
||||
# AMD K[78] 2500/1800 9900 8200(**)
|
||||
@ -502,6 +502,6 @@ for($i=0;$i<8;$i++) {
|
||||
&L(0xca,0x2d,0xbf,0x07,0xad,0x5a,0x83,0x33);
|
||||
|
||||
&function_end_B("whirlpool_block_mmx");
|
||||
&asm_finish();
|
||||
&asm_finish();
|
||||
|
||||
close STDOUT;
|
||||
|
@ -38,7 +38,7 @@ const X509V3_EXT_METHOD v3_crl_reason = {
|
||||
crl_reasons
|
||||
};
|
||||
|
||||
char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
|
||||
char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
|
||||
const ASN1_ENUMERATED *e)
|
||||
{
|
||||
ENUMERATED_NAMES *enam;
|
||||
|
@ -24,7 +24,7 @@ const X509V3_EXT_METHOD v3_skey_id = {
|
||||
NULL
|
||||
};
|
||||
|
||||
char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
|
||||
char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
|
||||
const ASN1_OCTET_STRING *oct)
|
||||
{
|
||||
return OPENSSL_buf2hexstr(oct->data, oct->length);
|
||||
|
@ -89,7 +89,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
&ja (&label("generic"));
|
||||
&and ("edx",0xefffffff); # clear hyper-threading bit
|
||||
&jmp (&label("generic"));
|
||||
|
||||
|
||||
&set_label("intel");
|
||||
&cmp ("edi",7);
|
||||
&jb (&label("cacheinfo"));
|
||||
|
@ -535,7 +535,7 @@ $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
|
||||
sub $len,%rsp
|
||||
shr \$3,$len
|
||||
lea (%rsp),$out
|
||||
.byte 0xf3,0x48,0xa5 # rep movsq
|
||||
.byte 0xf3,0x48,0xa5 # rep movsq
|
||||
lea (%r8),$out
|
||||
lea (%rsp),$inp
|
||||
mov $chunk,$len
|
||||
|
@ -805,7 +805,7 @@ X509_NAME_ENTRY *X509_NAME_ENTRY_create_by_txt(X509_NAME_ENTRY **ne,
|
||||
const unsigned char *bytes,
|
||||
int len);
|
||||
X509_NAME_ENTRY *X509_NAME_ENTRY_create_by_NID(X509_NAME_ENTRY **ne, int nid,
|
||||
int type,
|
||||
int type,
|
||||
const unsigned char *bytes,
|
||||
int len);
|
||||
int X509_NAME_add_entry_by_txt(X509_NAME *name, const char *field, int type,
|
||||
|
@ -178,7 +178,7 @@ static int wpacket_intern_close(WPACKET *pkt)
|
||||
}
|
||||
|
||||
/* Write out the WPACKET length if needed */
|
||||
if (sub->lenbytes > 0
|
||||
if (sub->lenbytes > 0
|
||||
&& !put_value((unsigned char *)&pkt->buf->data[sub->packet_len],
|
||||
packlen, sub->lenbytes))
|
||||
return 0;
|
||||
|
@ -707,7 +707,7 @@ int WPACKET_sub_allocate_bytes__(WPACKET *pkt, size_t len,
|
||||
* maximum size will be. If this function is used, then it should be immediately
|
||||
* followed by a WPACKET_allocate_bytes() call before any other WPACKET
|
||||
* functions are called (unless the write to the allocated bytes is abandoned).
|
||||
*
|
||||
*
|
||||
* For example: If we are generating a signature, then the size of that
|
||||
* signature may not be known in advance. We can use WPACKET_reserve_bytes() to
|
||||
* handle this:
|
||||
|
@ -6,7 +6,7 @@
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
# Perl utility to run PKITS tests for RFC3280 compliance.
|
||||
# Perl utility to run PKITS tests for RFC3280 compliance.
|
||||
|
||||
my $ossl_path;
|
||||
|
||||
|
@ -23,7 +23,7 @@ my %conversionforms = (
|
||||
sub tconversion {
|
||||
my $testtype = shift;
|
||||
my $t = shift;
|
||||
my @conversionforms =
|
||||
my @conversionforms =
|
||||
defined($conversionforms{$testtype}) ?
|
||||
@{$conversionforms{$testtype}} :
|
||||
@{$conversionforms{"*"}};
|
||||
|
@ -115,7 +115,7 @@ static int test_WPACKET_set_max_size(void)
|
||||
|| !WPACKET_set_max_size(&pkt, SIZE_MAX)
|
||||
|| !WPACKET_finish(&pkt)) {
|
||||
testfail("test_WPACKET_set_max_size():1 failed\n", &pkt);
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!WPACKET_init_len(&pkt, buf, 1)
|
||||
|
@ -8,7 +8,7 @@
|
||||
|
||||
# This is just a quick script to scan for cases where the 'error'
|
||||
# function name in a XXXerr() macro is wrong.
|
||||
#
|
||||
#
|
||||
# Run in the top level by going
|
||||
# perl util/ck_errf.pl */*.c */*/*.c
|
||||
#
|
||||
|
@ -40,7 +40,7 @@ if ($fnum <= 1)
|
||||
}
|
||||
|
||||
$dest = pop @filelist;
|
||||
|
||||
|
||||
if ($fnum > 2 && ! -d $dest)
|
||||
{
|
||||
die "Destination must be a directory";
|
||||
@ -73,5 +73,5 @@ foreach (@filelist)
|
||||
close(OUT);
|
||||
print "Copying: $_ to $dfile\n";
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -109,7 +109,7 @@ sub check_hash
|
||||
$hashval =~ s/^.*=\s+//;
|
||||
die "Invalid hash syntax in file" if (length($hashfile) != 40);
|
||||
die "Invalid hash received for file" if (length($hashval) != 40);
|
||||
die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
|
||||
die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
|
||||
}
|
||||
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
# existence:platform:kind:algorithms
|
||||
#
|
||||
# - "existence" can be "EXIST" or "NOEXIST" depending on if the symbol is
|
||||
# found somewhere in the source,
|
||||
# found somewhere in the source,
|
||||
# - "platforms" is empty if it exists on all platforms, otherwise it contains
|
||||
# comma-separated list of the platform, just as they are if the symbol exists
|
||||
# for those platforms, or prepended with a "!" if not. This helps resolve
|
||||
@ -172,7 +172,7 @@ foreach (@ARGV, split(/ /, $config{options}))
|
||||
|
||||
$do_ssl=1 if $_ eq "libssl";
|
||||
if ($_ eq "ssl") {
|
||||
$do_ssl=1;
|
||||
$do_ssl=1;
|
||||
$libname=$_
|
||||
}
|
||||
$do_crypto=1 if $_ eq "libcrypto";
|
||||
@ -211,7 +211,7 @@ foreach (@ARGV, split(/ /, $config{options}))
|
||||
|
||||
}
|
||||
|
||||
if (!$libname) {
|
||||
if (!$libname) {
|
||||
if ($do_ssl) {
|
||||
$libname="LIBSSL";
|
||||
}
|
||||
@ -339,7 +339,7 @@ if($do_crypto == 1) {
|
||||
}
|
||||
&update_numbers(*OUT,"LIBCRYPTO",*crypto_list,$max_crypto,@crypto_symbols);
|
||||
close OUT;
|
||||
}
|
||||
}
|
||||
|
||||
} elsif ($do_checkexist) {
|
||||
&check_existing(*ssl_list, @ssl_symbols)
|
||||
|
@ -97,7 +97,7 @@ Options:
|
||||
Default: keep previously assigned numbers. (You are warned
|
||||
when collisions are detected.)
|
||||
|
||||
-nostatic Generates a different source code, where these additional
|
||||
-nostatic Generates a different source code, where these additional
|
||||
functions are generated for each library specified in the
|
||||
config file:
|
||||
void ERR_load_<LIB>_strings(void);
|
||||
@ -105,7 +105,7 @@ Options:
|
||||
void ERR_<LIB>_error(int f, int r, char *fn, int ln);
|
||||
#define <LIB>err(f,r) ERR_<LIB>_error(f,r,OPENSSL_FILE,OPENSSL_LINE)
|
||||
while the code facilitates the use of these in an environment
|
||||
where the error support routines are dynamically loaded at
|
||||
where the error support routines are dynamically loaded at
|
||||
runtime.
|
||||
Default: 'static' code generation.
|
||||
|
||||
@ -114,8 +114,8 @@ Options:
|
||||
|
||||
-unref Print out unreferenced function and reason codes.
|
||||
|
||||
-write Actually (over)write the generated code to the header and C
|
||||
source files as assigned to each library through the config
|
||||
-write Actually (over)write the generated code to the header and C
|
||||
source files as assigned to each library through the config
|
||||
file.
|
||||
Default: don't write.
|
||||
|
||||
@ -196,7 +196,7 @@ while (($hdr, $lib) = each %libinc)
|
||||
if(/\/\*/) {
|
||||
if (not /\*\//) { # multiline comment...
|
||||
$line = $_; # ... just accumulate
|
||||
next;
|
||||
next;
|
||||
} else {
|
||||
s/\/\*.*?\*\///gs; # wipe it
|
||||
}
|
||||
@ -370,7 +370,7 @@ foreach $file (@source) {
|
||||
print STDERR "ERROR: mismatch $file:$linenr $func:$3\n";
|
||||
$errcount++;
|
||||
}
|
||||
print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
|
||||
print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
|
||||
}
|
||||
if(/(([A-Z0-9]+)_R_[A-Z0-9_]+)/) {
|
||||
next unless exists $csrc{$2};
|
||||
@ -379,8 +379,8 @@ foreach $file (@source) {
|
||||
$rcodes{$1} = "X";
|
||||
$rnew{$2}++;
|
||||
}
|
||||
print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
|
||||
}
|
||||
print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
|
||||
}
|
||||
}
|
||||
close IN;
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ sub structureData {
|
||||
if($inbrace) {
|
||||
if($item eq "}") {
|
||||
$inbrace --;
|
||||
|
||||
|
||||
if(!$inbrace) {
|
||||
$substruc = structureData($dataitem);
|
||||
$dataitem = $substruc;
|
||||
|
Loading…
Reference in New Issue
Block a user