AES for SPARC T4: add XTS, reorder subroutines to improve TLB locality.

This commit is contained in:
Andy Polyakov 2012-11-24 21:55:23 +00:00
parent 1c16fd1f03
commit cd68694646
3 changed files with 625 additions and 133 deletions

View File

@ -83,7 +83,12 @@ $::evp=1; # if $evp is set to 0, script generates module with
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code=<<___;
$code.=<<___ if ($::abibits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.text
.globl aes_t4_encrypt
@ -411,24 +416,6 @@ my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
.align 32
_aes128_encrypt_1x:
___
@ -477,6 +464,35 @@ $code.=<<___;
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
@ -524,28 +540,9 @@ $code.=<<___;
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
.align 32
_aes192_loadkey:
_aes256_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
.align 32
_aes192_encrypt_1x:
___
@ -594,54 +591,6 @@ $code.=<<___;
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
@ -718,6 +667,40 @@ $code.=<<___;
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
@ -793,21 +776,55 @@ $code.=<<___;
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
&alg_cbc_encrypt_implement("aes",128);
&alg_cbc_encrypt_implement("aes",192);
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_decrypt_implement("aes",128);
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_ctr32_implement("aes",192);
&alg_ctr32_implement("aes",256);
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {

View File

@ -511,6 +511,18 @@ void aes192_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out,
void aes256_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key,
unsigned char *ivec);
void aes128_t4_xts_encrypt (const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key1,
const AES_KEY *key2, const unsigned char *ivec);
void aes128_t4_xts_decrypt (const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key1,
const AES_KEY *key2, const unsigned char *ivec);
void aes256_t4_xts_encrypt (const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key1,
const AES_KEY *key2, const unsigned char *ivec);
void aes256_t4_xts_decrypt (const unsigned char *in, unsigned char *out,
size_t blocks, const AES_KEY *key1,
const AES_KEY *key2, const unsigned char *ivec);
static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
@ -681,46 +693,47 @@ static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (key)
{
int bits = ctx->key_len * 4;
xctx->stream = NULL;
/* key_len is two AES keys */
if (enc)
{
aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)aes_t4_encrypt;
#if 0 /* not yet */
switch (bits) {
case 128:
xctx->stream = aes128_t4_xts_encrypt;
break;
#if 0 /* not yet */
case 192:
xctx->stream = aes192_t4_xts_encrypt;
break;
#endif
case 256:
xctx->stream = aes256_t4_xts_encrypt;
break;
default:
return 0;
}
#endif
}
else
{
aes_t4_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)aes_t4_decrypt;
#if 0 /* not yet */
switch (bits) {
case 128:
xctx->stream = aes128_t4_xts_decrypt;
break;
#if 0 /* not yet */
case 192:
xctx->stream = aes192_t4_xts_decrypt;
break;
#endif
case 256:
xctx->stream = aes256_t4_xts_decrypt;
break;
default:
return 0;
}
#endif
}
aes_t4_set_encrypt_key(key + ctx->key_len/2,

View File

@ -900,6 +900,480 @@ $::code.=<<___;
___
}
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;
$::code.=<<___;
.globl ${alg}${bits}_t4_xts_${dir}crypt
.align 32
${alg}${bits}_t4_xts_${dir}crypt:
save %sp, -$::frame-16, %sp
mov $ivec, %o0
add %fp, $::bias-16, %o1
call ${alg}_t4_encrypt
mov $key2, %o2
add %fp, $::bias-16, %l7
ldxa [%l7]0x88, %g2
add %fp, $::bias-8, %l7
ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
sethi %hi(0x76543210), %l7
or %l7, %lo(0x76543210), %l7
bmask %l7, %g0, %g0 ! byte swap mask
prefetch [$inp], 20
prefetch [$inp + 63], 20
call _${alg}${bits}_load_${dir}ckey
and $len, 15, $rem
and $len, -16, $len
___
$code.=<<___ if ($dir eq "de");
mov 0, %l7
movrnz $rem, 16, %l7
sub $len, %l7, $len
___
$code.=<<___;
sub $inp, $out, $blk_init ! $inp!=$out
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 64, $iright
mov 0xff, $omask
sub $iright, $ileft, $iright
and $out, 7, $ooff
cmp $len, 255
movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
srl $omask, $ooff, $omask
andcc $len, 16, %g0 ! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
brz,pn $len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
alignaddrl $out, %g0, $out
bz %icc, .L${bits}_xts_${dir}loop2x
srlx $len, 4, $len
.L${bits}_xts_${dir}loop:
ldx [$inp + 0], %o0
brz,pt $ileft, 4f
ldx [$inp + 8], %o1
ldx [$inp + 16], %o2
sllx %o0, $ileft, %o0
srlx %o1, $iright, %g1
sllx %o1, $ileft, %o1
or %g1, %o0, %o0
srlx %o2, $iright, %o2
or %o2, %o1, %o1
4:
movxtod %g2, %f12
movxtod %g3, %f14
bshuffle %f12, %f12, %f12
bshuffle %f14, %f14, %f14
xor %g4, %o0, %o0 ! ^= rk[0]
xor %g5, %o1, %o1
movxtod %o0, %f0
movxtod %o1, %f2
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
prefetch [$out + 63], 22
prefetch [$inp + 16+63], 20
call _${alg}${bits}_${dir}crypt_1x
add $inp, 16, $inp
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %g2
and %l7, 0x87, %l7
addxc %g3, %g3, %g3
xor %l7, %g2, %g2
brnz,pn $ooff, 2f
sub $len, 1, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_xts_${dir}loop2x
add $out, 16, $out
brnz,pn $rem, .L${bits}_xts_${dir}steal
nop
ret
restore
.align 16
2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
! and ~3x deterioration
! in inp==out case
faligndata %f0, %f0, %f4 ! handle unaligned output
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $omask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $omask, $omask
stda %f8, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
orn %g0, $omask, $omask
brnz,pn $rem, .L${bits}_xts_${dir}steal
nop
ret
restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}loop2x:
ldx [$inp + 0], %o0
ldx [$inp + 8], %o1
ldx [$inp + 16], %o2
brz,pt $ileft, 4f
ldx [$inp + 24], %o3
ldx [$inp + 32], %o4
sllx %o0, $ileft, %o0
srlx %o1, $iright, %g1
or %g1, %o0, %o0
sllx %o1, $ileft, %o1
srlx %o2, $iright, %g1
or %g1, %o1, %o1
sllx %o2, $ileft, %o2
srlx %o3, $iright, %g1
or %g1, %o2, %o2
sllx %o3, $ileft, %o3
srlx %o4, $iright, %o4
or %o4, %o3, %o3
4:
movxtod %g2, %f12
movxtod %g3, %f14
bshuffle %f12, %f12, %f12
bshuffle %f14, %f14, %f14
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %g2
and %l7, 0x87, %l7
addxc %g3, %g3, %g3
xor %l7, %g2, %g2
movxtod %g2, %f8
movxtod %g3, %f10
bshuffle %f8, %f8, %f8
bshuffle %f10, %f10, %f10
xor %g4, %o0, %o0 ! ^= rk[0]
xor %g5, %o1, %o1
xor %g4, %o2, %o2 ! ^= rk[0]
xor %g5, %o3, %o3
movxtod %o0, %f0
movxtod %o1, %f2
movxtod %o2, %f4
movxtod %o3, %f6
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
fxor %f8, %f4, %f4 ! ^= tweak[0]
fxor %f10, %f6, %f6
prefetch [$out + 63], 22
prefetch [$inp + 32+63], 20
call _${alg}${bits}_${dir}crypt_2x
add $inp, 32, $inp
movxtod %g2, %f8
movxtod %g3, %f10
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %g2
and %l7, 0x87, %l7
addxc %g3, %g3, %g3
xor %l7, %g2, %g2
bshuffle %f8, %f8, %f8
bshuffle %f10, %f10, %f10
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
fxor %f8, %f4, %f4
fxor %f10, %f6, %f6
brnz,pn $ooff, 2f
sub $len, 2, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
std %f6, [$out + 24]
brnz,pt $len, .L${bits}_xts_${dir}loop2x
add $out, 32, $out
fsrc2 %f4, %f0
fsrc2 %f6, %f2
brnz,pn $rem, .L${bits}_xts_${dir}steal
nop
ret
restore
.align 16
2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
! and ~3x deterioration
! in inp==out case
faligndata %f0, %f0, %f8 ! handle unaligned output
faligndata %f0, %f2, %f10
faligndata %f2, %f4, %f12
faligndata %f4, %f6, %f14
faligndata %f6, %f6, %f0
stda %f8, [$out + $omask]0xc0 ! partial store
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
orn %g0, $omask, $omask
stda %f0, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
orn %g0, $omask, $omask
fsrc2 %f4, %f0
fsrc2 %f6, %f2
brnz,pn $rem, .L${bits}_xts_${dir}steal
nop
ret
restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}blk:
add $out, $len, $blk_init
and $blk_init, 63, $blk_init ! tail
sub $len, $blk_init, $len
add $blk_init, 15, $blk_init ! round up to 16n
srlx $len, 4, $len
srl $blk_init, 4, $blk_init
sub $len, 1, $len
add $blk_init, 1, $blk_init
.L${bits}_xts_${dir}blk2x:
ldx [$inp + 0], %o0
ldx [$inp + 8], %o1
ldx [$inp + 16], %o2
brz,pt $ileft, 5f
ldx [$inp + 24], %o3
ldx [$inp + 32], %o4
sllx %o0, $ileft, %o0
srlx %o1, $iright, %g1
or %g1, %o0, %o0
sllx %o1, $ileft, %o1
srlx %o2, $iright, %g1
or %g1, %o1, %o1
sllx %o2, $ileft, %o2
srlx %o3, $iright, %g1
or %g1, %o2, %o2
sllx %o3, $ileft, %o3
srlx %o4, $iright, %o4
or %o4, %o3, %o3
5:
movxtod %g2, %f12
movxtod %g3, %f14
bshuffle %f12, %f12, %f12
bshuffle %f14, %f14, %f14
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %g2
and %l7, 0x87, %l7
addxc %g3, %g3, %g3
xor %l7, %g2, %g2
movxtod %g2, %f8
movxtod %g3, %f10
bshuffle %f8, %f8, %f8
bshuffle %f10, %f10, %f10
xor %g4, %o0, %o0 ! ^= rk[0]
xor %g5, %o1, %o1
xor %g4, %o2, %o2 ! ^= rk[0]
xor %g5, %o3, %o3
movxtod %o0, %f0
movxtod %o1, %f2
movxtod %o2, %f4
movxtod %o3, %f6
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
fxor %f8, %f4, %f4 ! ^= tweak[0]
fxor %f10, %f6, %f6
prefetch [$inp + 32+63], 20
call _${alg}${bits}_${dir}crypt_2x
add $inp, 32, $inp
movxtod %g2, %f8
movxtod %g3, %f10
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %g2
and %l7, 0x87, %l7
addxc %g3, %g3, %g3
xor %l7, %g2, %g2
bshuffle %f8, %f8, %f8
bshuffle %f10, %f10, %f10
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
fxor %f8, %f4, %f4
fxor %f10, %f6, %f6
stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
add $out, 8, $out
stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
add $out, 8, $out
stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
add $out, 8, $out
stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
add $out, 8, $out
add $blk_init, $len, $len
andcc $len, 1, %g0 ! is number of blocks even?
membar #StoreLoad|#StoreStore
bnz,pt %icc, .L${bits}_xts_${dir}loop
srl $len, 0, $len
brnz,pn $len, .L${bits}_xts_${dir}loop2x
nop
fsrc2 %f4, %f0
fsrc2 %f6, %f2
brnz,pn $rem, .L${bits}_xts_${dir}steal
nop
ret
restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align 32
.L${bits}_xts_${dir}steal:
std %f0, [%fp + $::bias-16] ! copy of output
std %f2, [%fp + $::bias-8]
srl $ileft, 3, $ileft
add %fp, $::bias-16, %l7
add $inp, $ileft, $inp ! original $inp+$len&-15
add $out, $ooff, $out ! original $out+$len&-15
mov 0, $ileft
nop ! align
.L${bits}_xts_${dir}stealing:
ldub [$inp + $ileft], %o0
ldub [%l7 + $ileft], %o1
dec $rem
stb %o0, [%l7 + $ileft]
stb %o1, [$out + $ileft]
brnz $rem, .L${bits}_xts_${dir}stealing
inc $ileft
mov %l7, $inp
sub $out, 16, $out
mov 0, $ileft
sub $out, $ooff, $out
ba .L${bits}_xts_${dir}loop ! one more time
mov 1, $len ! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align 32
.L${bits}_xts_${dir}steal:
ldx [$inp + 0], %o0
brz,pt $ileft, 8f
ldx [$inp + 8], %o1
ldx [$inp + 16], %o2
sllx %o0, $ileft, %o0
srlx %o1, $iright, %g1
sllx %o1, $ileft, %o1
or %g1, %o0, %o0
srlx %o2, $iright, %o2
or %o2, %o1, %o1
8:
srax %g3, 63, %l7 ! next tweak value
addcc %g2, %g2, %o2
and %l7, 0x87, %l7
addxc %g3, %g3, %o3
xor %l7, %o2, %o2
movxtod %o2, %f12
movxtod %o3, %f14
bshuffle %f12, %f12, %f12
bshuffle %f14, %f14, %f14
xor %g4, %o0, %o0 ! ^= rk[0]
xor %g5, %o1, %o1
movxtod %o0, %f0
movxtod %o1, %f2
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
call _${alg}${bits}_${dir}crypt_1x
add $inp, 16, $inp
fxor %f12, %f0, %f0 ! ^= tweak[0]
fxor %f14, %f2, %f2
std %f0, [%fp + $::bias-16]
std %f2, [%fp + $::bias-8]
srl $ileft, 3, $ileft
add %fp, $::bias-16, %l7
add $inp, $ileft, $inp ! original $inp+$len&-15
add $out, $ooff, $out ! original $out+$len&-15
mov 0, $ileft
add $out, 16, $out
nop ! align
.L${bits}_xts_${dir}stealing:
ldub [$inp + $ileft], %o0
ldub [%l7 + $ileft], %o1
dec $rem
stb %o0, [%l7 + $ileft]
stb %o1, [$out + $ileft]
brnz $rem, .L${bits}_xts_${dir}stealing
inc $ileft
mov %l7, $inp
sub $out, 16, $out
mov 0, $ileft
sub $out, $ooff, $out
ba .L${bits}_xts_${dir}loop ! one more time
mov 1, $len ! $rem is 0
___
$code.=<<___;
ret
restore
.type ${alg}${bits}_t4_xts_${dir}crypt,#function
.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
@ -909,6 +1383,7 @@ sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
"bshuffle" => 0x04c,
"fnot2" => 0x066,
"fxor" => 0x06c,
"fsrc2" => 0x078 );
@ -933,13 +1408,17 @@ my %visopf = ( "faligndata" => 0x048,
return $ref;
}
}
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
"umulxhi" => 0x016,
"alignaddr" => 0x018,
"bmask" => 0x019,
"alignaddrl" => 0x01a );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
@ -956,20 +1435,6 @@ my %visopf = ( "addxc" => 0x011,
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
my $opf = $mnemonic =~ /l$/ ? 0x01a :0x18;
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
}
sub unaes_round { # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
@ -1121,35 +1586,32 @@ sub emit_assembler {
foreach (split("\n",$::code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/g;
s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
&unaes_round($1,$2,$3,$4,$5)
/ge or
/geo or
s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unaes_kexpand($1,$2,$3,$4)
/ge or
/geo or
s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
&uncamellia_f($1,$2,$3,$4,$5)
/ge or
/geo or
s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&uncamellia3($1,$2,$3,$4)
/ge or
/geo or
s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
&unmovxtox($1,$2,$3)
/ge or
/geo or
s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
&unmovxtox($1,$2,$3)
/ge or
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
/geo or
s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge or
s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge or
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
/geo or
s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
/geo;
print $_,"\n";
}