ARM assembler pack: reschedule instructions for dual-issue pipeline.

The modest improvement coefficients indicate that the code already had some
instruction-level parallelism, so there was not much room left to gain. Special
thanks to Ted Krovetz for benchmarking the code with such patience.
Andy Polyakov 2010-07-13 14:03:31 +00:00
parent 0852f90c30
commit 2d22e08083
4 changed files with 250 additions and 240 deletions

crypto/aes/asm/aes-armv4.pl

@@ -16,12 +16,17 @@
# allows a logical or arithmetic operation to be merged with a shift or
# rotate in one instruction, emitting the combined result every cycle. The module
# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
# key.
# key [on single-issue Xscale PXA250 core].
#
# May 2007.
#
# AES_set_[en|de]crypt_key is added.
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
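
The scheduling idea shows up directly in the byte-gathering sequences below: each `orr` folds its shift into a single instruction, and the rescheduled code interleaves the `ldrb`s with the `orr`s so a dual-issue core can pair a load with an ALU op. A C sketch of what one such sequence computes (the function name is illustrative, not part of the module):

```c
#include <stdint.h>

/* Endian-neutral big-endian word gather, as in the AES_encrypt
 * prologue below.  Each OR of a shifted byte maps to one ARM
 * instruction (orr rd,rd,rs,lsl#N) thanks to the folded shifter
 * operand; the asm interleaves the four ldrb's with these orr's. */
static uint32_t load_be32(const uint8_t *p)
{
    uint32_t w = p[3];           /* ldrb $s0,[$rounds,#3]   */
    w |= (uint32_t)p[2] << 8;    /* orr  $s0,$s0,$t1,lsl#8  */
    w |= (uint32_t)p[1] << 16;   /* orr  $s0,$s0,$t2,lsl#16 */
    w |= (uint32_t)p[0] << 24;   /* orr  $s0,$s0,$t3,lsl#24 */
    return w;
}
```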
@@ -167,24 +172,24 @@ AES_encrypt:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
@@ -199,24 +204,24 @@ AES_encrypt:
mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1]
strb $t3,[$rounds,#2]
strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24
strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16
strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5]
strb $t3,[$rounds,#6]
strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24
strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16
strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9]
strb $t3,[$rounds,#10]
strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24
strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16
strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13]
@@ -233,141 +238,137 @@ AES_encrypt:
.align 2
_armv4_AES_encrypt:
str lr,[sp,#-4]! @ push lr
ldr $t1,[$key],#16
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
ldr $rounds,[$key,#240-16]
ldmia $key!,{$t1-$i1}
eor $s0,$s0,$t1
ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $rounds,$rounds,#1
mov lr,#255
.Lenc_loop:
and $i1,lr,$s0
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0,lsr#16
and $i1,lr,$s0
mov $s0,$s0,lsr#24
.Lenc_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
and $i1,lr,$s1,lsr#16 @ i0
ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
and $i2,lr,$s1
ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
and $i3,lr,$s1,lsr#8
ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
mov $s1,$s1,lsr#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
eor $s0,$s0,$i1,ror#8
eor $s1,$s1,$t1,ror#24
eor $t2,$t2,$i2,ror#8
eor $t3,$t3,$i3,ror#8
ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,ror#8
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
mov $s2,$s2,lsr#24
eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
eor $s1,$s1,$i2,ror#8
eor $s2,$s2,$t2,ror#16
eor $t3,$t3,$i3,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
and $i1,lr,$s3 @ i0
eor $s1,$s1,$i2,ror#8
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
mov $s3,$s3,lsr#24
eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
ldr $t1,[$key],#16
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
ldr $t3,[$key,#-4]
and $i1,lr,$s0
eor $s1,$s1,$t1
and $i2,lr,$s0,lsr#8
eor $s2,$s2,$t2
and $i3,lr,$s0,lsr#16
eor $s3,$s3,$t3
mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1
bne .Lenc_loop
add $tbl,$tbl,#2
and $i1,lr,$s0
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0,lsr#16
mov $s0,$s0,lsr#24
ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
and $i1,lr,$s1,lsr#16 @ i0
ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
and $i2,lr,$s1
ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
and $i3,lr,$s1,lsr#8
ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
mov $s1,$s1,lsr#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
eor $s0,$i1,$s0,lsl#8
eor $s1,$t1,$s1,lsl#24
eor $t2,$i2,$t2,lsl#8
eor $t3,$i3,$t3,lsl#8
ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$i2,$t2,lsl#8
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
mov $s2,$s2,lsr#24
eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
eor $s1,$s1,$i2,lsl#16
eor $s2,$t2,$s2,lsl#24
eor $t3,$i3,$t3,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
and $i1,lr,$s3 @ i0
eor $s1,$s1,$i2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
mov $s3,$s3,lsr#24
eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24
ldr $t3,[$key,#12]
ldr lr,[sp],#4 @ pop lr
ldr $t1,[$key,#0]
ldr $t2,[$key,#4]
ldr $t3,[$key,#8]
ldr $i1,[$key,#12]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
eor $s0,$s0,$i1
eor $s1,$s1,$t1
eor $s2,$s2,$t2
eor $s3,$s3,$t3
sub $tbl,$tbl,#2
mov pc,lr @ return
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
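
As a reading aid for `.Lenc_loop` above: the module keeps a single 1KB table and synthesizes the other three by rotation, which is what the `ror#8/16/24` operands on the `eor`s do. A sketch in C of one output column, with illustrative names (the real table layout and driver are in the module itself):

```c
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
    return (x >> n) | (x << (32 - n));
}

/* One output column of .Lenc_loop: "Te1".."Te3" are the single 1KB
 * Te0 table rotated on the fly, matching the eor ...,ror#N above. */
static uint32_t enc_column(const uint32_t Te0[256],
                           uint32_t s0, uint32_t s1, uint32_t s2,
                           uint32_t s3, uint32_t rk)
{
    return Te0[s0 >> 24]                        /* Te0[s0>>24] */
         ^ ror32(Te0[(s1 >> 16) & 0xff],  8)    /* Te1[s1>>16] */
         ^ ror32(Te0[(s2 >>  8) & 0xff], 16)    /* Te2[s2>>8]  */
         ^ ror32(Te0[ s3        & 0xff], 24)    /* Te3[s3>>0]  */
         ^ rk;
}
```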
.global AES_set_encrypt_key
@@ -402,31 +403,31 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
str $s0,[$key],#16
orr $s3,$s3,$t2,lsl#16
str $s1,[$key,#-12]
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
@@ -440,27 +441,26 @@ AES_set_encrypt_key:
.L128_loop:
and $t2,lr,$s3,lsr#24
and $i1,lr,$s3,lsr#16
and $i2,lr,$s3,lsr#8
and $i3,lr,$s3
ldrb $t2,[$tbl,$t2]
and $i2,lr,$s3,lsr#8
ldrb $i1,[$tbl,$i1]
and $i3,lr,$s3
ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $t2,$t2,$t1
eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s0,[$key],#16
eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
str $s1,[$key,#-12]
eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s2,[$key,#-8]
str $s3,[$key,#-4]
subs $rounds,$rounds,#1
str $s3,[$key,#-4]
bne .L128_loop
sub r2,$key,#176
b .Ldone
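
The comments in `.L128_loop` spell out the AES-128 key-schedule recurrence. A C sketch of one iteration, assuming the S-box and the per-round `rcon` word are supplied by the caller (building them is outside this sketch):

```c
#include <stdint.h>

/* rk[0..3] hold the previous round key; this fills rk[4..7], mirroring
 * the .L128_loop comments: rk[4]=rk[0]^SubWord(RotWord(rk[3]))^rcon,
 * then each further word XORs its predecessor. */
static void aes128_expand_round(uint32_t rk[8], const uint8_t sbox[256],
                                uint32_t rcon)
{
    uint32_t w = rk[3];
    uint32_t t = ((uint32_t)sbox[(w >> 16) & 0xff] << 24) |  /* ldrb/orr */
                 ((uint32_t)sbox[(w >>  8) & 0xff] << 16) |
                 ((uint32_t)sbox[ w        & 0xff] <<  8) |
                  (uint32_t)sbox[ w >> 24         ];
    rk[4] = rk[0] ^ t ^ rcon;   /* rk[4]=rk[0]^...   */
    rk[5] = rk[1] ^ rk[4];      /* rk[5]=rk[1]^rk[4] */
    rk[6] = rk[2] ^ rk[5];      /* rk[6]=rk[2]^rk[5] */
    rk[7] = rk[3] ^ rk[6];      /* rk[7]=rk[3]^rk[6] */
}
```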
@@ -471,16 +471,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#17]
ldrb $t3,[$rounds,#16]
orr $i2,$i2,$t1,lsl#8
orr $i2,$i2,$t2,lsl#16
orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#23]
orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#22]
orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#21]
ldrb $t3,[$rounds,#20]
orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16
orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
teq lr,#192
@@ -494,27 +494,26 @@ AES_set_encrypt_key:
.L192_loop:
and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16
and $i2,lr,$i3,lsr#8
and $i3,lr,$i3
ldrb $t2,[$tbl,$t2]
and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1]
and $i3,lr,$i3
ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s0,[$key],#24
eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
str $s1,[$key,#-20]
eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s2,[$key,#-16]
str $s3,[$key,#-12]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
subeq r2,$key,#216
beq .Ldone
@@ -532,16 +531,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#25]
ldrb $t3,[$rounds,#24]
orr $i2,$i2,$t1,lsl#8
orr $i2,$i2,$t2,lsl#16
orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#31]
orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#30]
orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#29]
ldrb $t3,[$rounds,#28]
orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16
orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
mov $rounds,#14
@@ -553,52 +552,51 @@ AES_set_encrypt_key:
.L256_loop:
and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16
and $i2,lr,$i3,lsr#8
and $i3,lr,$i3
ldrb $t2,[$tbl,$t2]
and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1]
and $i3,lr,$i3
ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s0,[$key],#32
eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
str $s1,[$key,#-28]
eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s2,[$key,#-24]
str $s3,[$key,#-20]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
subeq r2,$key,#256
beq .Ldone
and $t2,lr,$s3
and $i1,lr,$s3,lsr#8
and $i2,lr,$s3,lsr#16
and $i3,lr,$s3,lsr#24
ldrb $t2,[$tbl,$t2]
and $i2,lr,$s3,lsr#16
ldrb $i1,[$tbl,$i1]
and $i3,lr,$s3,lsr#24
ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i1,lsl#8
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16
ldr $t1,[$key,#-48]
orr $t2,$t2,$i3,lsl#24
ldr $t1,[$key,#-48]
ldr $i1,[$key,#-44]
ldr $i2,[$key,#-40]
ldr $i3,[$key,#-36]
eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
ldr $i3,[$key,#-36]
eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $t1,[$key,#-16]
eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
str $i1,[$key,#-12]
eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $i2,[$key,#-8]
str $i3,[$key,#-4]
b .L256_loop
@@ -819,24 +817,24 @@ AES_decrypt:
ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8
@@ -851,24 +849,24 @@ AES_decrypt:
mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1]
strb $t3,[$rounds,#2]
strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24
strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16
strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5]
strb $t3,[$rounds,#6]
strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24
strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16
strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9]
strb $t3,[$rounds,#10]
strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24
strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16
strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13]
@@ -885,146 +883,143 @@ AES_decrypt:
.align 2
_armv4_AES_decrypt:
str lr,[sp,#-4]! @ push lr
ldr $t1,[$key],#16
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
ldr $rounds,[$key,#240-16]
ldmia $key!,{$t1-$i1}
eor $s0,$s0,$t1
ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $rounds,$rounds,#1
mov lr,#255
.Ldec_loop:
and $i1,lr,$s0,lsr#16
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0
mov $s0,$s0,lsr#24
.Ldec_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
and $i1,lr,$s1 @ i0
ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
and $i2,lr,$s1,lsr#16
ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
and $i3,lr,$s1,lsr#8
ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
mov $s1,$s1,lsr#24
ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
eor $s0,$s0,$i1,ror#24
eor $s1,$s1,$t1,ror#8
eor $t2,$i2,$t2,ror#8
eor $t3,$i3,$t3,ror#8
ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$i2,$t2,ror#8
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
mov $s2,$s2,lsr#24
eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
eor $s1,$s1,$i2,ror#24
eor $s2,$s2,$t2,ror#8
eor $t3,$i3,$t3,ror#8
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
and $i1,lr,$s3,lsr#16 @ i0
eor $s1,$s1,$i2,ror#24
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
mov $s3,$s3,lsr#24
eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8
ldr $t1,[$key],#16
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
ldr $t1,[$key,#-12]
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
and $i2,lr,$s0,lsr#8
eor $s2,$s2,$t2
and $i3,lr,$s0
eor $s3,$s3,$t3
mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1
bne .Ldec_loop
add $tbl,$tbl,#1024
ldr $t1,[$tbl,#0] @ prefetch Td4
ldr $t2,[$tbl,#32]
ldr $t3,[$tbl,#64]
ldr $i1,[$tbl,#96]
ldr $i2,[$tbl,#128]
ldr $i3,[$tbl,#160]
ldr $t1,[$tbl,#192]
ldr $t2,[$tbl,#224]
ldr $t2,[$tbl,#0] @ prefetch Td4
ldr $t3,[$tbl,#32]
ldr $t1,[$tbl,#64]
ldr $t2,[$tbl,#96]
ldr $t3,[$tbl,#128]
ldr $t1,[$tbl,#160]
ldr $t2,[$tbl,#192]
ldr $t3,[$tbl,#224]
and $i1,lr,$s0,lsr#16
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0
ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24]
ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i1,lr,$s1 @ i0
ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
and $i2,lr,$s1,lsr#16
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s1,$t1,$s1,lsl#8
eor $t2,$t2,$i2,lsl#8
eor $t3,$t3,$i3,lsl#8
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
and $i3,lr,$s2,lsr#16
eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
eor $s2,$t2,$s2,lsl#16
eor $t3,$t3,$i3,lsl#16
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
and $i3,lr,$s3 @ i2
eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$i3,$s2,lsl#8
ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24
ldr $t3,[$key,#12]
ldr lr,[sp],#4 @ pop lr
ldr $t1,[$key,#0]
ldr $t2,[$key,#4]
ldr $t3,[$key,#8]
ldr $i1,[$key,#12]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
eor $s0,$s0,$i1
eor $s1,$s1,$t1
eor $s2,$s2,$t2
eor $s3,$s3,$t3
sub $tbl,$tbl,#1024
mov pc,lr @ return
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2

crypto/modes/asm/ghash-armv4.pl

@@ -19,6 +19,12 @@
# loop, this assembler loop body was found to be ~3x smaller than
# the compiler-generated one...
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# Note about the "528B" variant. In the ARM case it makes less sense to
# implement it, for the following reasons:
#
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
mov $nhi,$nhi,lsl#1
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eorpl $nlo,$nlo,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
ldr $len,[sp,#32] @ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
mov $nhi,$nhi,lsl#1
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
.Loop2:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2
___
&Zsmash();
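
What `.Loop`/`.Loop2` compute is the classic 4-bit-table GHASH multiply: Z is assembled from 16-byte `Htbl[]` entries (one per nibble value) and shifted right four bits at a time, with `rem_4bit[rem]` folding the shifted-out bits back into the top 16 bits (the GF(2^128) reduction, the asm's `eor $Zhh,...,lsl#16`). A C sketch over 64-bit halves, where the asm uses the four 32-bit registers `$Zll`..`$Zhh`; the function name is illustrative and building `Htbl` from H is omitted:

```c
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Standard reduction constants for the four bits shifted out at the
 * bottom, applied at bits 112..127 of Z. */
static const uint16_t rem_4bit[16] = {
    0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
    0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0
};

/* Z = Xi * H, with Htbl[n] = n*H precomputed (the ldmia loads above). */
static void gmult_4bit(u128 *Z, const u128 Htbl[16], const uint8_t Xi[16])
{
    Z->hi = Z->lo = 0;
    for (int i = 15; i >= 0; i--) {
        /* low nibble first, then high -- same order as the asm */
        for (int s = 0; s <= 4; s += 4) {
            uint8_t n   = (Xi[i] >> s) & 0xf;
            uint8_t rem = (uint8_t)(Z->lo & 0xf);  /* bits shifted out */
            Z->lo = (Z->hi << 60) | (Z->lo >> 4);
            Z->hi = (Z->hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
            Z->hi ^= Htbl[n].hi;
            Z->lo ^= Htbl[n].lo;
        }
    }
}
```

(`gcm_ghash_4bit` is the same step with each `Xi` byte first XORed against the input block, the `eor $nlo,$nlo,$nhi` above.)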

crypto/sha/asm/sha256-armv4.pl

@@ -11,7 +11,12 @@
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte.
# byte [on single-issue Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
str $T1,[sp,#`$i%16`*4]
mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
add $T1,$T1,$t0
eor $t1,$f,$g
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$t1
add $T1,$T1,$h
add $T1,$T1,$t2
mov $h,$a,ror#$Sigma0[0]
add $T1,$T1,$t1
eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
orr $t0,$a,$b
and $t0,$t0,$c
and $t1,$a,$b
orr $t0,$t0,$t1 @ Maj(a,b,c)
add $h,$h,$t0
add $d,$d,$T1
and $t0,$t0,$c
add $h,$h,$T1
orr $t0,$t0,$t1 @ Maj(a,b,c)
add $d,$d,$T1
add $h,$h,$t0
___
}
@@ -80,19 +85,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
ldr $T1,[sp,#`($i+0)%16`*4]
ldr $inp,[sp,#`($i+9)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0]
eor $t1,$t1,$t2,ror#$sigma1[1]
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t0
add $T1,$T1,$t1
eor $t1,$t1,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t1
___
&BODY_00_15(@_);
}
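
The `Sigma1(e)`, `Ch(e,f,g)`, `Maj(a,b,c)` and `sigma0`/`sigma1` annotations above are the FIPS-180 SHA-256 round functions; spelled out in C for reference, factored the same way the asm factors them:

```c
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
    return (x >> n) | (x << (32 - n));
}

static inline uint32_t Sigma0(uint32_t a) { return ror32(a,2)  ^ ror32(a,13) ^ ror32(a,22); }
static inline uint32_t Sigma1(uint32_t e) { return ror32(e,6)  ^ ror32(e,11) ^ ror32(e,25); }
static inline uint32_t sigma0(uint32_t x) { return ror32(x,7)  ^ ror32(x,18) ^ (x >> 3);  }
static inline uint32_t sigma1(uint32_t x) { return ror32(x,17) ^ ror32(x,19) ^ (x >> 10); }
/* Ch and Maj exactly as the eor/and/orr sequences compute them: */
static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return ((f ^ g) & e) ^ g; }
static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return ((a | b) & c) | (a & b); }

/* BODY_00_15 then computes T1 = W[i] + K[i] + h + Sigma1(e) + Ch(e,f,g),
 * d += T1 and h = T1 + Sigma0(a) + Maj(a,b,c); BODY_16_XX extends the
 * schedule: W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]. */
```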

crypto/sha/asm/sha512-armv4.pl

@@ -10,7 +10,13 @@
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte.
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
#
# Byte order [in]dependence. =========================================
#
@@ -73,33 +79,31 @@ $code.=<<___;
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += h
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
str $Ehi,[sp,#$Eoff+4]
str $Alo,[sp,#$Aoff+0]
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t2,[$Ktbl,#4] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi
ldr $Elo,[sp,#$Doff+0] @ d.lo
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
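
Throughout this file, 64-bit quantities live in register pairs, and the `adds`/`adc` idiom above performs the 64-bit additions: `adds` adds the low halves and sets the carry flag, `adc` folds the carry into the high halves. A minimal C equivalent of one such step:

```c
#include <stdint.h>

/* 64-bit accumulate in 32-bit halves, as in "adds $Tlo,...; adc $Thi,...":
 * the carry out of the low-word add is (new_lo < old_lo). */
static void add64(uint32_t *hi, uint32_t *lo, uint32_t ahi, uint32_t alo)
{
    uint32_t t = *lo + alo;   /* adds: low halves, sets carry  */
    *hi += ahi + (t < *lo);   /* adc:  high halves plus carry  */
    *lo  = t;
}
```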