#! /usr/bin/env perl
# Copyright 2022-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
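# rev32: byte-swap each 32-bit lane of a vector. SM4 is specified on
# big-endian words, while AArch64 loads are little-endian by default,
# so every block is byte-reversed per word on entry and exit. On a
# big-endian build (__AARCH64EB__) the words are already in the right
# order and a plain mov suffices.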
sub rev32() {
my $dst = shift;
my $src = shift;
if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
rev32 $dst.16b,$src.16b
#else
mov $dst.16b,$src.16b
#endif
___
} else {
$code.=<<___;
#ifndef __AARCH64EB__
rev32 $dst.16b,$dst.16b
#endif
___
}
}
sub rev32_armeb() {
my $dst = shift;
my $src = shift;
if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
rev32 $dst.16b,$src.16b
#else
mov $dst.16b,$src.16b
#endif
___
} else {
$code.=<<___;
#ifdef __AARCH64EB__
rev32 $dst.16b,$dst.16b
#endif
___
}
}
sub rbit() {
my $dst = shift;
my $src = shift;
my $std = shift;
if ($src and ("$src" ne "$dst")) {
if ($std eq "_gb") {
$code.=<<___;
rbit $dst.16b,$src.16b
___
} else {
$code.=<<___;
mov $dst.16b,$src.16b
___
}
} else {
if ($std eq "_gb") {
$code.=<<___;
rbit $dst.16b,$src.16b
___
}
}
}
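# transpose: 4x4 transpose of the 32-bit words held across four
# vectors (zip1/zip2 on .4s lanes, then on .2d lanes), converting
# between the word-sliced layout used by the parallel round function
# and the one-block-per-register layout of the in/out buffers.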
sub transpose() {
my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
$code.=<<___;
zip1 $vt0.4s,$dat0.4s,$dat1.4s
zip2 $vt1.4s,$dat0.4s,$dat1.4s
zip1 $vt2.4s,$dat2.4s,$dat3.4s
zip2 $vt3.4s,$dat2.4s,$dat3.4s
zip1 $dat0.2d,$vt0.2d,$vt2.2d
zip2 $dat1.2d,$vt0.2d,$vt2.2d
zip1 $dat2.2d,$vt1.2d,$vt3.2d
zip2 $dat3.2d,$vt1.2d,$vt3.2d
___
}
# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
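# The input byte is split into its low nibble (masked with
# ANDMaskV = 0x0f) and high nibble (ushr #4), each used as a tbl index
# into a 16-entry table. Because a GF(2) matrix multiply is linear,
# Mat*x = lowerMat*lo(x) ^ higherMat*hi(x), so two table lookups and
# an eor implement an arbitrary 8x8 bit-matrix multiply per byte.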
sub mul_matrix() {
my $x = shift;
my $higherMat = shift;
my $lowerMat = shift;
my $tmp = shift;
$code.=<<___;
ushr $tmp.16b, $x.16b, 4
and $x.16b, $x.16b, $ANDMaskV.16b
tbl $x.16b, {$lowerMat.16b}, $x.16b
tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
eor $x.16b, $x.16b, $tmp.16b
___
}
# sbox operation for 4 lanes of words
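# The SM4 S-box is computed via the AES S-box: an input affine
# transform (TAHMatV/TALMatV) maps the data into the AES field, a
# single AESE with an all-zero round key applies ShiftRows and
# SubBytes, and an output affine transform (ATAHMatV/ATALMatV) maps
# the result back. The leading tbl with MaskV pre-permutes the bytes
# with the inverse-ShiftRows pattern, cancelling the ShiftRows step
# performed inside AESE.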
sub sbox() {
my $dat = shift;
$code.=<<___;
// optimize sbox using AESE instruction
tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
aese @vtmp[0].16b,@vtmp[1].16b
___
&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
$code.=<<___;
mov $dat.16b,@vtmp[0].16b
// linear transformation
ushr @vtmp[0].4s,$dat.4s,32-2
ushr @vtmp[1].4s,$dat.4s,32-10
ushr @vtmp[2].4s,$dat.4s,32-18
ushr @vtmp[3].4s,$dat.4s,32-24
sli @vtmp[0].4s,$dat.4s,2
sli @vtmp[1].4s,$dat.4s,10
sli @vtmp[2].4s,$dat.4s,18
sli @vtmp[3].4s,$dat.4s,24
eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
eor $dat.16b,$dat.16b,$vtmp4.16b
___
}
# sbox operation for 8 lanes of words
sub sbox_double() {
my $dat = shift;
my $datx = shift;
$code.=<<___;
// optimize sbox using AESE instruction
tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
&mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
aese @vtmp[0].16b,$vtmp5.16b
aese @vtmp[1].16b,$vtmp5.16b
___
&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
&mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
$code.=<<___;
mov $dat.16b,@vtmp[0].16b
mov $datx.16b,@vtmp[1].16b
// linear transformation
ushr @vtmp[0].4s,$dat.4s,32-2
ushr $vtmp5.4s,$datx.4s,32-2
ushr @vtmp[1].4s,$dat.4s,32-10
ushr @vtmp[2].4s,$dat.4s,32-18
ushr @vtmp[3].4s,$dat.4s,32-24
sli @vtmp[0].4s,$dat.4s,2
sli $vtmp5.4s,$datx.4s,2
sli @vtmp[1].4s,$dat.4s,10
sli @vtmp[2].4s,$dat.4s,18
sli @vtmp[3].4s,$dat.4s,24
eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
eor $dat.16b,$dat.16b,$vtmp4.16b
ushr @vtmp[1].4s,$datx.4s,32-10
ushr @vtmp[2].4s,$datx.4s,32-18
ushr @vtmp[3].4s,$datx.4s,32-24
sli @vtmp[1].4s,$datx.4s,10
sli @vtmp[2].4s,$datx.4s,18
sli @vtmp[3].4s,$datx.4s,24
eor $vtmp4.16b,$vtmp5.16b,$datx.16b
eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
eor $datx.16b,$datx.16b,$vtmp4.16b
___
}
# sbox operation for a single word
sub sbox_1word () {
my $word = shift;
$code.=<<___;
mov @vtmp[3].s[0],$word
// optimize sbox using AESE instruction
tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
aese @vtmp[0].16b,@vtmp[1].16b
___
&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
mov $wtmp0,@vtmp[0].s[0]
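// linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24),
// with ror #(32-n) standing in for a rotate-left by n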
eor $word,$wtmp0,$wtmp0,ror #32-2
eor $word,$word,$wtmp0,ror #32-10
eor $word,$word,$wtmp0,ror #32-18
eor $word,$word,$wtmp0,ror #32-24
___
}
# sm4 for one block of data, in scalar registers word0/word1/word2/word3
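# One invocation performs four SM4 rounds, consuming four round keys;
# the caller loops it eight times for the full 32 rounds:
#   X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i])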
sub sm4_1blk () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $tmpw,$word2,$word3
eor $wtmp2,$wtmp0,$word1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word0,$word0,$tmpw
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $tmpw,$word2,$word3
eor $wtmp2,$word0,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor $word1,$word1,$tmpw
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $tmpw,$word0,$word1
eor $wtmp2,$wtmp0,$word3
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word2,$word2,$tmpw
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $tmpw,$word0,$word1
eor $wtmp2,$word2,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word3,$word3,$tmpw
___
}
# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
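# Four blocks are processed in parallel: after an ld4 de-interleave,
# data[i] holds word i of all four blocks, and dup broadcasts each
# round key across the four lanes.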
sub sm4_4blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rk0.16b,@data[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $rka.16b,$rka.16b,@data[0].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk1.16b
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rk0.16b,@data[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $rka.16b,$rka.16b,@data[2].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk1.16b
___
}
# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rkb.16b,@datax[2].16b,@datax[3].16b
eor @vtmp[0].16b,@data[1].16b,$rk0.16b
eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
eor @datax[0].16b,@datax[0].16b,$rk1.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[0].16b
eor $rkb.16b,$rkb.16b,@datax[0].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk0.16b
eor @datax[1].16b,@datax[1].16b,$rk1.16b
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rkb.16b,@datax[0].16b,@datax[1].16b
eor @vtmp[0].16b,@data[3].16b,$rk0.16b
eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
eor @datax[2].16b,@datax[2].16b,$rk1.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[2].16b
eor $rkb.16b,$rkb.16b,@datax[2].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk0.16b
eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}
sub encrypt_1blk_norev() {
my $dat = shift;
$code.=<<___;
mov $ptr,$rks
mov $counter,#8
mov $word0,$dat.s[0]
mov $word1,$dat.s[1]
mov $word2,$dat.s[2]
mov $word3,$dat.s[3]
10:
___
&sm4_1blk($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
mov $dat.s[0],$word3
mov $dat.s[1],$word2
mov $dat.s[2],$word1
mov $dat.s[3],$word0
___
}
sub encrypt_1blk() {
my $dat = shift;
&encrypt_1blk_norev($dat);
&rev32($dat,$dat);
}
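# In the multi-block encrypt helpers below, the outputs are assigned
# in reverse index order (data[0] -> vtmp[3], etc.): with one word of
# each block per register, this index reversal implements SM4's final
# word-order reversal R(B0,B1,B2,B3) = (B3,B2,B1,B0).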
sub encrypt_4blks() {
$code.=<<___;
mov $ptr,$rks
mov $counter,#8
10:
___
&sm4_4blks($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
___
&rev32(@vtmp[3],@data[0]);
&rev32(@vtmp[2],@data[1]);
&rev32(@vtmp[1],@data[2]);
&rev32(@vtmp[0],@data[3]);
}
sub encrypt_8blks() {
$code.=<<___;
mov $ptr,$rks
mov $counter,#8
10:
___
&sm4_8blks($ptr);
$code.=<<___;
subs $counter,$counter,#1
b.ne 10b
___
&rev32(@vtmp[3],@data[0]);
&rev32(@vtmp[2],@data[1]);
&rev32(@vtmp[1],@data[2]);
&rev32(@vtmp[0],@data[3]);
&rev32(@data[3],@datax[0]);
&rev32(@data[2],@datax[1]);
&rev32(@data[1],@datax[2]);
&rev32(@data[0],@datax[3]);
}
sub load_sbox () {
my $data = shift;
$code.=<<___;
ldr $MaskQ, .Lsbox_magic
ldr $TAHMatQ, .Lsbox_magic+16
ldr $TALMatQ, .Lsbox_magic+32
ldr $ATAHMatQ, .Lsbox_magic+48
ldr $ATALMatQ, .Lsbox_magic+64
ldr $ANDMaskQ, .Lsbox_magic+80
___
}
sub mov_reg_to_vec() {
my $src0 = shift;
my $src1 = shift;
my $desv = shift;
$code.=<<___;
mov $desv.d[0],$src0
mov $desv.d[1],$src1
___
&rev32_armeb($desv,$desv);
}
sub mov_vec_to_reg() {
my $srcv = shift;
my $des0 = shift;
my $des1 = shift;
$code.=<<___;
mov $des0,$srcv.d[0]
mov $des1,$srcv.d[1]
___
}
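# compute_tweak: multiply a 128-bit XTS tweak (src1:src0) by x in
# GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1:
# shift left by one bit and, if the top bit was set, xor 0x87 into the
# low byte. The asr #31 turns the sign of the tweak's top word into an
# all-ones mask that is and-ed with 0x87.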
sub compute_tweak() {
my $src0 = shift;
my $src1 = shift;
my $des0 = shift;
my $des1 = shift;
$code.=<<___;
mov $wtmp0,0x87
extr $xtmp2,$src1,$src1,#32
extr $des1,$src1,$src0,#63
and $wtmp1,$wtmp0,$wtmp2,asr#31
eor $des0,$xtmp1,$src0,lsl#1
___
}
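# compute_tweak_vec: the same GF(2^128) doubling on a tweak held in a
# vector register. The per-byte shl #1 drops the carry bits, which are
# recovered with ext/ushr and re-inserted (including the polynomial
# reduction) by multiplying with the .Lxts_magic constant. For the
# "_gb" variant the tweak is kept bit-reversed, so rbit is applied
# around the computation.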
sub compute_tweak_vec() {
my $src = shift;
my $des = shift;
my $std = shift;
&rbit(@vtmp[2],$src,$std);
$code.=<<___;
ldr @qtmp[0], .Lxts_magic
shl $des.16b, @vtmp[2].16b, #1
ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
ushr @vtmp[1].16b, @vtmp[1].16b, #7
mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
eor $des.16b, $des.16b, @vtmp[1].16b
___
&rbit($des,$des,$std);
}
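# Constants: .Lck and .Lfk are the SM4 CK/FK constants from the
# standard; .Lshuffles rotates the key-schedule state by one word;
# .Lxts_magic is the GF(2^128) reduction constant; .Lsbox_magic holds
# the inverse-ShiftRows byte permutation, the two pairs of affine
# matrices used around AESE, and the 0x0f nibble mask.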
$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
.text
.type _${prefix}_consts,%object
.align 7
_${prefix}_consts:
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
.quad 0x0b0e0104070a0d00,0x0306090c0f020508
.quad 0x62185a2042387a00,0x22581a6002783a40
.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
.size _${prefix}_consts,.-_${prefix}_consts
___
{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
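# _set_key derives the 32 round keys. Decryption uses the same keys in
# reverse order, so for decryption the store pointer starts at the end
# of the schedule (keys + 124) and walks backwards.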
$code.=<<___;
.type _${prefix}_set_key,%function
.align 4
_${prefix}_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {$vkey.4s},[$key]
___
&load_sbox();
&rev32($vkey,$vkey);
$code.=<<___;
adr $pointer,.Lshuffles
ld1 {$vmap.2d},[$pointer]
adr $pointer,.Lfk
ld1 {$vfk.2d},[$pointer]
eor $vkey.16b,$vkey.16b,$vfk.16b
mov $schedules,#32
adr $pointer,.Lck
movi @vtmp[0].16b,#64
cbnz $enc,1f
add $keys,$keys,124
1:
mov $wtmp,$vkey.s[1]
ldr $roundkey,[$pointer],#4
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[2]
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[3]
eor $roundkey,$roundkey,$wtmp
// optimize sbox using AESE instruction
mov @data[0].s[0],$roundkey
tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
aese @vtmp[0].16b,@vtmp[1].16b
___
&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
mov $wtmp,@vtmp[0].s[0]
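// key-schedule linear transform L'(B) = B ^ (B<<<13) ^ (B<<<23),
// computed as B ^ ror(B,19) ^ ror(B,9)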
eor $roundkey,$wtmp,$wtmp,ror #19
eor $roundkey,$roundkey,$wtmp,ror #9
mov $wtmp,$vkey.s[0]
eor $roundkey,$roundkey,$wtmp
mov $vkey.s[0],$roundkey
cbz $enc,2f
str $roundkey,[$keys],#4
b 3f
2:
str $roundkey,[$keys],#-4
3:
tbl $vkey.16b,{$vkey.16b},$vmap.16b
subs $schedules,$schedules,#1
b.ne 1b
ret
.size _${prefix}_set_key,.-_${prefix}_set_key
___
}}}
{{{
$code.=<<___;
.type _${prefix}_enc_4blks,%function
.align 4
_${prefix}_enc_4blks:
AARCH64_VALID_CALL_TARGET
___
&encrypt_4blks();
$code.=<<___;
ret
.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}
{{{
$code.=<<___;
.type _${prefix}_enc_8blks,%function
.align 4
_${prefix}_enc_8blks:
AARCH64_VALID_CALL_TARGET
___
&encrypt_8blks();
$code.=<<___;
ret
.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}
{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,1
bl _${prefix}_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,0
bl _${prefix}_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($inp,$outp,$rk)=map("x$_",(0..2));
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
AARCH64_VALID_CALL_TARGET
ld1 {@data[0].4s},[$inp]
___
&load_sbox();
&rev32(@data[0],@data[0]);
$code.=<<___;
mov $rks,$rk
___
&encrypt_1blk(@data[0]);
$code.=<<___;
st1 {@data[0].4s},[$outp]
ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
___
&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
cmp $blocks,#8
b.lt .Lecb_4_blocks_process
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],@datax[3]);
$code.=<<___;
bl _${prefix}_enc_8blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp $blocks,#4
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
sub $blocks,$blocks,#4
1:
// process last block
cmp $blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].4s},[$inp]
___
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
$code.=<<___;
st1 {@data[0].4s},[$outp]
b 100f
1: // process last 2 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
cmp $blocks,#2
b.gt 1f
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
b 100f
1: // process last 3 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");
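# CBC encryption is inherently serial; four blocks are still loaded
# per iteration so the chaining xors and byte reversals can be
# interleaved with the single-block encryptions. CBC decryption is
# parallel and uses the 8-/4-block helpers.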
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
lsr $len,$len,4
___
&load_sbox();
$code.=<<___;
cbz $enc,.Ldec
ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
cmp $blocks,#4
b.lt 1f
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
eor @data[0].16b,@data[0].16b,$ivec0.16b
___
&rev32(@data[1],@data[1]);
&rev32(@data[0],@data[0]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&encrypt_1blk_norev(@data[0]);
$code.=<<___;
eor @data[1].16b,@data[1].16b,@data[0].16b
___
&encrypt_1blk_norev(@data[1]);
&rev32(@data[0],@data[0]);
$code.=<<___;
eor @data[2].16b,@data[2].16b,@data[1].16b
___
&encrypt_1blk_norev(@data[2]);
&rev32(@data[1],@data[1]);
$code.=<<___;
eor @data[3].16b,@data[3].16b,@data[2].16b
___
&encrypt_1blk_norev(@data[3]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
orr $ivec0.16b,@data[3].16b,@data[3].16b
st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.ne .Lcbc_4_blocks_enc
b 2f
1:
subs $blocks,$blocks,#1
b.lt 2f
ld1 {@data[0].4s},[$inp],#16
eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
&rev32($ivec0,$ivec0);
&encrypt_1blk($ivec0);
$code.=<<___;
st1 {$ivec0.4s},[$outp],#16
b 1b
2:
// save back IV
st1 {$ivec0.4s},[$ivp]
ret
.Ldec:
// decryption mode starts
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
cmp $blocks,#8
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
add $ptr,$inp,#64
ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],@datax[3]);
$code.=<<___;
bl _${prefix}_enc_8blks
___
&transpose(@vtmp,@datax);
&transpose(@data,@datax);
$code.=<<___;
ld1 {$ivec1.4s},[$ivp]
ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
// note: ivec1 and vtmpx[3] share the same register,
// so care must be taken to avoid a conflict
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
eor @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
// save back IV
st1 {@vtmpx[3].4s}, [$ivp]
eor @data[0].16b,@data[0].16b,@datax[3].16b
eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
eor @data[3].16b,@data[3].16b,@vtmpx[2].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lcbc_8_blocks_dec
b.eq 100f
1:
ld1 {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
cmp $blocks,#4
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&transpose(@vtmp,@datax);
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
orr $ivec1.16b,@data[3].16b,@data[3].16b
eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
eor @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.gt .Lcbc_4_blocks_dec
// save back IV
st1 {@data[3].4s}, [$ivp]
b 100f
1: // last block
subs $blocks,$blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].4s},[$inp],#16
// save back IV
st1 {@data[0].4s}, [$ivp]
___
&rev32(@datax[0],@data[0]);
&encrypt_1blk(@datax[0]);
$code.=<<___;
eor @datax[0].16b,@datax[0].16b,$ivec1.16b
st1 {@datax[0].4s},[$outp],#16
b 100f
1: // last two blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
add $ptr,$inp,#16
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
subs $blocks,$blocks,1
b.gt 1f
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
&transpose(@vtmp,@datax);
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
// save back IV
st1 {@data[1].4s}, [$ivp]
b 100f
1: // last 3 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _${prefix}_enc_4blks
ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
&transpose(@vtmp,@datax);
$code.=<<___;
eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
// save back IV
st1 {@data[2].4s}, [$ivp]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");
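# ctr32: as the name implies, only the last (big-endian low-order)
# 32-bit word of the counter block is incremented; it is kept in a
# scalar register and stitched into lane 3 of each counter vector.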
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
ld1 {$ivec.4s},[$ivp]
___
&rev32($ivec,$ivec);
&load_sbox();
$code.=<<___;
cmp $blocks,#1
b.ne 1f
// fast path for a single block, avoiding the
// context-saving overhead
___
&encrypt_1blk($ivec);
$code.=<<___;
ld1 {@data[0].4s},[$inp]
eor @data[0].16b,@data[0].16b,$ivec.16b
st1 {@data[0].4s},[$outp]
ret
1:
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
mov $word0,$ivec.s[0]
mov $word1,$ivec.s[1]
mov $word2,$ivec.s[2]
mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
cmp $blocks,#4
b.lt 1f
dup @data[0].4s,$word0
dup @data[1].4s,$word1
dup @data[2].4s,$word2
mov @data[3].s[0],$ctr
add $ctr,$ctr,#1
mov @data[3].s[1],$ctr
add $ctr,$ctr,#1
mov @data[3].s[2],$ctr
add $ctr,$ctr,#1
mov @data[3].s[3],$ctr
add $ctr,$ctr,#1
cmp $blocks,#8
b.ge .Lctr32_8_blocks_process
bl _${prefix}_enc_4blks
ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
subs $blocks,$blocks,#4
b.ne .Lctr32_4_blocks_process
b 100f
.Lctr32_8_blocks_process:
dup @datax[0].4s,$word0
dup @datax[1].4s,$word1
dup @datax[2].4s,$word2
mov @datax[3].s[0],$ctr
add $ctr,$ctr,#1
mov @datax[3].s[1],$ctr
add $ctr,$ctr,#1
mov @datax[3].s[2],$ctr
add $ctr,$ctr,#1
mov @datax[3].s[3],$ctr
add $ctr,$ctr,#1
bl _${prefix}_enc_8blks
ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
eor @data[0].16b,@data[0].16b,@datax[0].16b
eor @data[1].16b,@data[1].16b,@datax[1].16b
eor @data[2].16b,@data[2].16b,@datax[2].16b
eor @data[3].16b,@data[3].16b,@datax[3].16b
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.ne .Lctr32_4_blocks_process
b 100f
1: // last block processing
subs $blocks,$blocks,#1
b.lt 100f
b.gt 1f
mov $ivec.s[0],$word0
mov $ivec.s[1],$word1
mov $ivec.s[2],$word2
mov $ivec.s[3],$ctr
___
&encrypt_1blk($ivec);
$code.=<<___;
ld1 {@data[0].4s},[$inp]
eor @data[0].16b,@data[0].16b,$ivec.16b
st1 {@data[0].4s},[$outp]
b 100f
1: // last 2 blocks processing
dup @data[0].4s,$word0
dup @data[1].4s,$word1
dup @data[2].4s,$word2
mov @data[3].s[0],$ctr
add $ctr,$ctr,#1
mov @data[3].s[1],$ctr
subs $blocks,$blocks,#1
b.ne 1f
bl _${prefix}_enc_4blks
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
b 100f
1: // last 3 blocks processing
add $ctr,$ctr,#1
mov @data[3].s[2],$ctr
bl _${prefix}_enc_4blks
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");
my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");
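# gen_xts_cipher emits two XTS entry points: the standard variant ("")
# and the "_gb" variant, which performs the tweak arithmetic on
# bit-reversed values (see rbit in compute_tweak_vec). Encryption and
# decryption share one body, with the direction passed in w6 (1 for
# encryption, 0 for decryption); the tail handles ciphertext stealing
# for lengths that are not a multiple of 16.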
sub gen_xts_cipher() {
my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
AARCH64_SIGN_LINK_REGISTER
stp x15, x16, [sp, #-0x10]!
stp x17, x18, [sp, #-0x10]!
stp x19, x20, [sp, #-0x10]!
stp x21, x22, [sp, #-0x10]!
stp x23, x24, [sp, #-0x10]!
stp x25, x26, [sp, #-0x10]!
stp x27, x28, [sp, #-0x10]!
stp x29, x30, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d14, d15, [sp, #-0x10]!
mov $rks1,x3
mov $rks2,x4
mov $enc,w6
ld1 {@tweak[0].4s}, [$ivp]
mov $rks,$rks2
___
&load_sbox();
&rev32(@tweak[0],@tweak[0]);
&encrypt_1blk(@tweak[0]);
$code.=<<___;
mov $rks,$rks1
and $remain,$len,#0x0F
// convert length into blocks
lsr $blocks,$len,4
cmp $blocks,#1
b.lt .return${std}
cmp $remain,0
// If the encryption/decryption length is a multiple of 16,
// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
b.eq .xts_encrypt_blocks${std}
// If the encryption/decryption length is not a multiple of 16,
// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
subs $blocks,$blocks,#1
b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
&rbit(@tweak[0],@tweak[0],$std);
&rev32_armeb(@tweak[0],@tweak[0]);
&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
cmp $blocks,#8
___
&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
&mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
&mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
&mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
b.lt .Lxts_4_blocks_process${std}
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&rbit(@tweak[0],@tweak[0],$std);
&rbit(@tweak[1],@tweak[1],$std);
&rbit(@tweak[2],@tweak[2],$std);
&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
eor @data[1].16b, @data[1].16b, @tweak[1].16b
eor @data[2].16b, @data[2].16b, @tweak[2].16b
eor @data[3].16b, @data[3].16b, @tweak[3].16b
ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
&rbit(@tweak[4],@tweak[4],$std);
&rbit(@tweak[5],@tweak[5],$std);
&rbit(@tweak[6],@tweak[6],$std);
&rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],@datax[3]);
&transpose(@data,@vtmp);
&transpose(@datax,@vtmp);
$code.=<<___;
bl _${prefix}_enc_8blks
___
&transpose(@vtmp,@datax);
&transpose(@data,@datax);
$code.=<<___;
eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
eor @data[0].16b, @data[0].16b, @tweak[4].16b
eor @data[1].16b, @data[1].16b, @tweak[5].16b
eor @data[2].16b, @data[2].16b, @tweak[6].16b
eor @data[3].16b, @data[3].16b, @tweak[7].16b
// save the last tweak
mov $lastTweak.16b,@tweak[7].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lxts_8_blocks_process${std}
b 100f
.Lxts_4_blocks_process${std}:
cmp $blocks,#4
b.lt 1f
ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&rbit(@tweak[0],@tweak[0],$std);
&rbit(@tweak[1],@tweak[1],$std);
&rbit(@tweak[2],@tweak[2],$std);
&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
eor @data[1].16b, @data[1].16b, @tweak[1].16b
eor @data[2].16b, @data[2].16b, @tweak[2].16b
eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&transpose(@data,@vtmp);
$code.=<<___;
bl _${prefix}_enc_4blks
___
&transpose(@vtmp,@data);
$code.=<<___;
eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
sub $blocks,$blocks,#4
mov @tweak[0].16b,@tweak[4].16b
mov @tweak[1].16b,@tweak[5].16b
mov @tweak[2].16b,@tweak[6].16b
// save the last tweak
mov $lastTweak.16b,@tweak[3].16b
1:
// process last block
cmp $blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].4s},[$inp],#16
___
&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
st1 {@data[0].4s},[$outp],#16
// save the last tweak
mov $lastTweak.16b,@tweak[0].16b
b 100f
1: // process last 2 blocks
cmp $blocks,#2
b.gt 1f
ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
&rbit(@tweak[0],@tweak[0],$std);
&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&transpose(@data,@vtmp);
$code.=<<___;
bl _${prefix}_enc_4blks
___
&transpose(@vtmp,@data);
$code.=<<___;
eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
// save the last tweak
mov $lastTweak.16b,@tweak[1].16b
b 100f
1: // process last 3 blocks
ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
&rbit(@tweak[0],@tweak[0],$std);
&rbit(@tweak[1],@tweak[1],$std);
&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[0].16b
eor @data[1].16b, @data[1].16b, @tweak[1].16b
eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&transpose(@data,@vtmp);
$code.=<<___;
bl _${prefix}_enc_4blks
___
&transpose(@vtmp,@data);
$code.=<<___;
eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
// save the last tweak
mov $lastTweak.16b,@tweak[2].16b
100:
cmp $remain,0
b.eq .return${std}
// This branch calculates the last two tweaks
// when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
&rev32_armeb($lastTweak,$lastTweak);
&compute_tweak_vec($lastTweak,@tweak[1],$std);
&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
b .check_dec${std}
// This branch calculates the last two tweaks
// when the encryption/decryption length is exactly 32, which needs only two tweaks
.only_2blks_tweak${std}:
mov @tweak[1].16b,@tweak[0].16b
___
&rev32_armeb(@tweak[1],@tweak[1]);
&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
b .check_dec${std}
// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
// encryption:1 decryption:0
cmp $enc,1
b.eq .process_last_2blks${std}
mov @vtmp[0].16b,@tweak[1].16b
mov @tweak[1].16b,@tweak[2].16b
mov @tweak[2].16b,@vtmp[0].16b
.process_last_2blks${std}:
___
&rev32_armeb(@tweak[1],@tweak[1]);
&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
ld1 {@data[0].4s},[$inp],#16
eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[1].16b
st1 {@data[0].4s},[$outp],#16
sub $lastBlk,$outp,16
.loop${std}:
subs $remain,$remain,1
ldrb $wtmp0,[$lastBlk,$remain]
ldrb $wtmp1,[$inp,$remain]
strb $wtmp1,[$lastBlk,$remain]
strb $wtmp0,[$outp,$remain]
b.gt .loop${std}
ld1 {@data[0].4s}, [$lastBlk]
eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
$code.=<<___;
eor @data[0].16b, @data[0].16b, @tweak[2].16b
st1 {@data[0].4s}, [$lastBlk]
.return${std}:
ldp d14, d15, [sp], #0x10
ldp d12, d13, [sp], #0x10
ldp d10, d11, [sp], #0x10
ldp d8, d9, [sp], #0x10
ldp x29, x30, [sp], #0x10
ldp x27, x28, [sp], #0x10
ldp x25, x26, [sp], #0x10
ldp x23, x24, [sp], #0x10
ldp x21, x22, [sp], #0x10
ldp x19, x20, [sp], #0x10
ldp x17, x18, [sp], #0x10
ldp x15, x16, [sp], #0x10
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}
########################################
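# Copy this script's own leading comment block (the license header)
# into the generated file, with '#' translated to '//', then emit the
# accumulated code with any backquoted expressions evaluated.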
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";