openssl/crypto/sm4/asm/sm4-armv8.pl
Daniel Hu 15b7175f55 SM4 optimization for ARM by HW instruction
This patch implements the SM4 optimization for ARM processor,
using SM4 HW instruction, which is an optional feature of
crypto extension for aarch64 V8.

Tested on several modern ARM micro-architectures with SM4 support, this
patch delivers a performance uplift of roughly 8X~40X over the existing
C implementation in OpenSSL. Modes that can be parallelized
(such as CTR, ECB and CBC decryption) are at the higher end, while modes
such as CBC encryption are at the lower end (due to inter-block dependency).

Perf data on Yitian-710 2.75GHz hardware, before and after optimization:

Before:
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  105787.80k   107837.87k   108380.84k   108462.08k   108549.46k   108554.92k
  SM4-ECB  111924.58k   118173.76k   119776.00k   120093.70k   120264.02k   120274.94k
  SM4-CBC  106428.09k   109190.98k   109674.33k   109774.51k   109827.41k   109827.41k

After (7.4x - 36.6x faster):
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  781979.02k  2432994.28k  3437753.86k  3834177.88k  3963715.58k  3974556.33k
  SM4-ECB  937590.69k  2941689.02k  3945751.81k  4328655.87k  4459181.40k  4468692.31k
  SM4-CBC  890639.88k  1027746.58k  1050621.78k  1056696.66k  1058613.93k  1058701.31k

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17455)
2022-01-18 11:52:14 +01:00

636 lines
16 KiB
Raku
Executable File

#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# This module implements support for SM4 hw support on aarch64
# Oct 2021
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory of this script,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted in the same way as the interpreter and the
# output file, so a source tree located under a path containing spaces
# still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix of every symbol emitted by this module.
$prefix="sm4_v8";

# Vector registers v0-v7; each generated function loads the round keys into
# them and the enc_blk/enc_4blks helpers consume them.
my @rks=map("v$_",(0..7));
# rev32(dst, src)
#
# Appends to $code a "rev32" instruction (byte reversal inside every 32-bit
# lane) from register src into register dst.  The instruction is wrapped in
# "#ifndef __ARMEB__", so it is assembled only when __ARMEB__ (big-endian ARM)
# is not defined.
#
# The sub is declared without a prototype: it takes two arguments, and an
# empty "()" prototype would reject any call that is not written with a
# leading "&".
sub rev32 {
    my ($dst, $src) = @_;
    $code.=<<___;
#ifndef __ARMEB__
rev32 $dst.16b,$src.16b
#endif
___
}
# enc_blk(data)
#
# Appends to $code the SM4 processing of a single block held in vector
# register data: one "sm4e" per round-key register in @rks[0..7], followed by
# rev64 + ext #8, which together reverse the order of the four 32-bit words.
#
# Declared without a prototype: the sub takes one argument, and an empty "()"
# prototype would reject any call not written with a leading "&".
sub enc_blk {
    my ($data) = @_;

    # Same instruction for each of the eight round-key registers, in order.
    $code .= "sm4e $data.4s,$_.4s\n" for @rks[0..7];

    $code.=<<___;
rev64 $data.4S,$data.4S
ext $data.16b,$data.16b,$data.16b,#8
___
}
# enc_4blks(data0, data1, data2, data3)
#
# Appends to $code the SM4 processing of four independent blocks held in the
# given vector registers.  For every round-key register the "sm4e" instruction
# is issued for all four blocks back to back; for the last round-key register
# the per-block word reversal (rev64 + ext #8) is interleaved with the
# remaining "sm4e" instructions.
#
# Declared without a prototype: the sub takes four arguments, and an empty
# "()" prototype would reject any call not written with a leading "&".
sub enc_4blks {
    my ($data0, $data1, $data2, $data3) = @_;

    # Round-key registers 0..6: the four blocks are processed one after the
    # other for each register.
    for my $rk (@rks[0..6]) {
        $code .= "sm4e $_.4s,$rk.4s\n" for ($data0, $data1, $data2, $data3);
    }

    # Round-key register 7, with the final word reversal of each block
    # scheduled between the sm4e instructions of the following blocks.
    $code.=<<___;
sm4e $data0.4s,$rks[7].4s
rev64 $data0.4S,$data0.4S
sm4e $data1.4s,$rks[7].4s
ext $data0.16b,$data0.16b,$data0.16b,#8
rev64 $data1.4S,$data1.4S
sm4e $data2.4s,$rks[7].4s
ext $data1.16b,$data1.16b,$data1.16b,#8
rev64 $data2.4S,$data2.4S
sm4e $data3.4s,$rks[7].4s
ext $data2.16b,$data2.16b,$data2.16b,#8
rev64 $data3.4S,$data3.4S
ext $data3.16b,$data3.16b,$data3.16b,#8
___
}
# Start of the generated assembly: header include, architecture and section.
$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
.text
___
{{{
# .Lck holds the 32 key-schedule constants, printed four per line.
# Byte j (most significant first) of constant i is ((4*i + j) * 7) mod 256,
# which reproduces the table 0x00070E15 ... 0x646B7279.
my @ck_rows;
for my $row (0..7) {
    my @words;
    for my $i (4*$row .. 4*$row+3) {
        my $word = 0;
        $word = ($word << 8) | (((4*$i + $_) * 7) & 0xff) for (0..3);
        push @words, sprintf("0x%08X", $word);
    }
    push @ck_rows, ".long ".join(", ", @words);
}

# .Lfk holds the four constants XORed into the user key before expansion.
$code.=join("\n",
    ".align 6",
    ".Lck:",
    @ck_rows,
    ".Lfk:",
    ".long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc")."\n";
}}}
{{{
# ${prefix}_set_encrypt_key
#   x0: pointer to the 16-byte user key
#   x1: pointer to the output round-key buffer (8 x 16 bytes are stored)
#   x2: scratch pointer used to address the .Lfk / .Lck tables
# v0-v7 receive the round keys, v16-v23 the .Lck constants, v24 the .Lfk
# constants.
my ($userkey,$rkout,$ptr)=map("x$_",(0..2));
my @rk=map("v$_",(0..7));
my @ck=map("v$_",(16..23));
my $fk="v24";

# Load the user key, the FK constants and the first half of the CK table.
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
AARCH64_VALID_CALL_TARGET
ld1 {$rk[0].4s},[$userkey]
adr $ptr,.Lfk
ld1 {$fk.4s},[$ptr]
adr $ptr,.Lck
ld1 {$ck[0].4s,$ck[1].4s,$ck[2].4s,$ck[3].4s},[$ptr],64
___

# Byte-swap the key words on little-endian builds.
&rev32($rk[0], $rk[0]);

# XOR with FK, then derive each group of four round keys from the previous
# group; the first four groups are stored while the last three are computed.
$code.=<<___;
ld1 {$ck[4].4s,$ck[5].4s,$ck[6].4s,$ck[7].4s},[$ptr]
eor $rk[0].16b,$rk[0].16b,$fk.16b;
sm4ekey $rk[0].4S,$rk[0].4S,$ck[0].4S
sm4ekey $rk[1].4S,$rk[0].4S,$ck[1].4S
sm4ekey $rk[2].4S,$rk[1].4S,$ck[2].4S
sm4ekey $rk[3].4S,$rk[2].4S,$ck[3].4S
sm4ekey $rk[4].4S,$rk[3].4S,$ck[4].4S
st1 {$rk[0].4s,$rk[1].4s,$rk[2].4s,$rk[3].4s},[$rkout],64
sm4ekey $rk[5].4S,$rk[4].4S,$ck[5].4S
sm4ekey $rk[6].4S,$rk[5].4S,$ck[6].4S
sm4ekey $rk[7].4S,$rk[6].4S,$ck[7].4S
st1 {$rk[4].4s,$rk[5].4s,$rk[6].4s,$rk[7].4s},[$rkout]
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
{{{
# ${prefix}_set_decrypt_key
#   x0: pointer to the 16-byte user key
#   x1: pointer to the output round-key buffer (8 x 16 bytes are stored)
#   x2: scratch pointer used to address the .Lfk / .Lck tables
# The round-key groups are numbered in generation order but mapped onto the
# registers in reverse: group 0 lives in v7 and group 7 in v0.  Together with
# the word reversal applied to every group, the two final stores (v0-v3, then
# v4-v7) write the schedule in the opposite order to set_encrypt_key.
my ($userkey,$rkout,$ptr)=map("x$_",(0..2));
my @rk=reverse map("v$_",(0..7));
my @ck=map("v$_",(16..23));
my $fk="v24";

# Load the user key, the FK constants and the first half of the CK table.
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
AARCH64_VALID_CALL_TARGET
ld1 {$rk[0].4s},[$userkey]
adr $ptr,.Lfk
ld1 {$fk.4s},[$ptr]
adr $ptr, .Lck
ld1 {$ck[0].4s,$ck[1].4s,$ck[2].4s,$ck[3].4s},[$ptr],64
___

# Byte-swap the key words on little-endian builds.
&rev32($rk[0], $rk[0]);

# Expand the key; each group is word-reversed (rev64 + ext #8) only after the
# next group has been derived from it.
$code.=<<___;
ld1 {$ck[4].4s,$ck[5].4s,$ck[6].4s,$ck[7].4s},[$ptr]
eor $rk[0].16b, $rk[0].16b,$fk.16b;
sm4ekey $rk[0].4S,$rk[0].4S,$ck[0].4S
sm4ekey $rk[1].4S,$rk[0].4S,$ck[1].4S
sm4ekey $rk[2].4S,$rk[1].4S,$ck[2].4S
rev64 $rk[0].4s,$rk[0].4s
rev64 $rk[1].4s,$rk[1].4s
ext $rk[0].16b,$rk[0].16b,$rk[0].16b,#8
ext $rk[1].16b,$rk[1].16b,$rk[1].16b,#8
sm4ekey $rk[3].4S,$rk[2].4S,$ck[3].4S
sm4ekey $rk[4].4S,$rk[3].4S,$ck[4].4S
rev64 $rk[2].4s,$rk[2].4s
rev64 $rk[3].4s,$rk[3].4s
ext $rk[2].16b,$rk[2].16b,$rk[2].16b,#8
ext $rk[3].16b,$rk[3].16b,$rk[3].16b,#8
sm4ekey $rk[5].4S,$rk[4].4S,$ck[5].4S
sm4ekey $rk[6].4S,$rk[5].4S,$ck[6].4S
rev64 $rk[4].4s,$rk[4].4s
rev64 $rk[5].4s,$rk[5].4s
ext $rk[4].16b,$rk[4].16b,$rk[4].16b,#8
ext $rk[5].16b,$rk[5].16b,$rk[5].16b,#8
sm4ekey $rk[7].4S,$rk[6].4S,$ck[7].4S
rev64 $rk[6].4s, $rk[6].4s
rev64 $rk[7].4s, $rk[7].4s
ext $rk[6].16b,$rk[6].16b,$rk[6].16b,#8
ext $rk[7].16b,$rk[7].16b,$rk[7].16b,#8
st1 {$rk[7].4s,$rk[6].4s,$rk[5].4s,$rk[4].4s},[$rkout],64
st1 {$rk[3].4s,$rk[2].4s,$rk[1].4s,$rk[0].4s},[$rkout]
ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block(dir)
#
# Appends to $code the single-block function ${prefix}_${dir}crypt:
#   x0: pointer to the 16-byte input block
#   x1: pointer to the 16-byte output block
#   x2: pointer to the round keys (8 x 16 bytes, loaded into @rks)
# dir only selects the symbol name; the emitted instructions are the same for
# "en" and "de", so the direction is determined by the key schedule the
# caller passes in x2 (presumably the one produced by the matching
# set_*_key function -- confirm against the callers).
#
# Declared without a prototype: the sub takes one argument, and an empty "()"
# prototype would reject any call not written with a leading "&".
sub gen_block {
    my ($dir) = @_;
    my ($inp,$out,$rk)=map("x$_",(0..2));
    my ($data)=("v16");

    # Load the block and all eight round-key registers.
    $code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
AARCH64_VALID_CALL_TARGET
ld1 {$data.4s},[$inp]
ld1 {$rks[0].4s,$rks[1].4s,$rks[2].4s,$rks[3].4s},[$rk],64
ld1 {$rks[4].4s,$rks[5].4s,$rks[6].4s,$rks[7].4s},[$rk]
___

    # Byte-swap in, run the rounds, byte-swap out.
    &rev32($data,$data);
    &enc_blk($data);
    &rev32($data,$data);

    $code.=<<___;
st1 {$data.4s},[$out]
ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}

&gen_block("en");
&gen_block("de");
}}}
{{{
# ${prefix}_ecb_encrypt
#   x0: input pointer      x1: output pointer
#   x2: length in bytes    x3: pointer to the round keys
#   w4: enc flag -- named here but never referenced by the emitted code
# Data is processed in chunks of 8 blocks (128 bytes), then 4 blocks
# (64 bytes), then single 16-byte blocks.
my ($inp,$out,$len,$rk)=map("x$_",(0..3));
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

# Prologue: load the round keys, then load 4 or 8 blocks depending on $len.
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
AARCH64_VALID_CALL_TARGET
ld1 {$rks[0].4s,$rks[1].4s,$rks[2].4s,$rks[3].4s},[$rk],#64
ld1 {$rks[4].4s,$rks[5].4s,$rks[6].4s,$rks[7].4s},[$rk]
1:
cmp $len,#64
b.lt 1f
ld1 {$dat[0].4s,$dat[1].4s,$dat[2].4s,$dat[3].4s},[$inp],#64
cmp $len,#128
b.lt 2f
ld1 {$dat[4].4s,$dat[5].4s,$dat[6].4s,$dat[7].4s},[$inp],#64
// 8 blocks
___

# 8-block path: swap in, two 4-block passes, swap out.  The first store is
# emitted before the last two output swaps.
&rev32($_,$_) foreach @dat[0..7];
&enc_4blks(@dat[0..3]);
&enc_4blks(@dat[4..7]);
&rev32($_,$_) foreach @dat[0..5];
$code.=<<___;
st1 {$dat[0].4s,$dat[1].4s,$dat[2].4s,$dat[3].4s},[$out],#64
___
&rev32($_,$_) foreach @dat[6..7];
$code.=<<___;
st1 {$dat[4].4s,$dat[5].4s,$dat[6].4s,$dat[7].4s},[$out],#64
subs $len,$len,#128
b.gt 1b
ret
// 4 blocks
2:
___

# 4-block path.
&rev32($_,$_) foreach @dat[0..3];
&enc_4blks(@dat[0..3]);
&rev32($_,$_) foreach @dat[0..3];
$code.=<<___;
st1 {$dat[0].4s,$dat[1].4s,$dat[2].4s,$dat[3].4s},[$out],#64
subs $len,$len,#64
b.gt 1b
1:
subs $len,$len,#16
b.lt 1f
ld1 {$dat[0].4s},[$inp],#16
___

# Single-block tail; the closing b.ne still tests the flags set by the subs
# at the top of the tail loop.
&rev32($dat[0],$dat[0]);
&enc_blk($dat[0]);
&rev32($dat[0],$dat[0]);
$code.=<<___;
st1 {$dat[0].4s},[$out],#16
b.ne 1b
1:
ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
# ${prefix}_cbc_encrypt
#   x0: input pointer      x1: output pointer
#   x2: length in bytes    x3: pointer to the round keys
#   x4: pointer to the IV (read on entry, written back on exit)
#   w5: enc flag; zero branches to the decryption path at .Ldec
# @dat (v16-v23) holds the blocks being processed, @in (v24-v31) keeps
# untouched copies of the ciphertext on the decryption path, and v8 carries
# the running IV.  d8/d9 are pushed on entry and popped on exit.
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($enc) = ("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec) = ("v8");
# Prologue, then the start of the encryption path: load four blocks and XOR
# the IV into the first one.
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
stp d8,d9,[sp, #-16]!
ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
ld1 {$ivec.4s},[$ivp]
cmp $enc,#0
b.eq .Ldec
1:
cmp $len, #64
b.lt 1f
ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
eor @dat[0].16b,@dat[0].16b,$ivec.16b
___
# Encryption is chained: every block is XORed with the previous result
# before enc_blk, so the four blocks are processed one after another.  The
# XOR is done while both operands are still in their byte-swapped form; the
# output swap of a block is emitted after the following block's enc_blk.
&rev32(@dat[1],@dat[1]);
&rev32(@dat[0],@dat[0]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],@dat[3]);
&enc_blk(@dat[0]);
$code.=<<___;
eor @dat[1].16b,@dat[1].16b,@dat[0].16b
___
&enc_blk(@dat[1]);
&rev32(@dat[0],@dat[0]);
$code.=<<___;
eor @dat[2].16b,@dat[2].16b,@dat[1].16b
___
&enc_blk(@dat[2]);
&rev32(@dat[1],@dat[1]);
$code.=<<___;
eor @dat[3].16b,@dat[3].16b,@dat[2].16b
___
&enc_blk(@dat[3]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],@dat[3]);
# The last output block becomes the new IV; then the single-block
# encryption tail begins.
$code.=<<___;
mov $ivec.16b,@dat[3].16b
st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
subs $len,$len,#64
b.ne 1b
1:
subs $len,$len,#16
b.lt 3f
ld1 {@dat[0].4s},[$inp],#16
eor $ivec.16b,$ivec.16b,@dat[0].16b
___
# Tail block is processed in place in the IV register.
&rev32($ivec,$ivec);
&enc_blk($ivec);
&rev32($ivec,$ivec);
# Decryption path: each group of four blocks is loaded twice from the same
# address (only the second load post-increments), once into @dat for
# processing and once into @in to keep the ciphertext for the chaining XOR.
$code.=<<___;
st1 {$ivec.16b},[$out],#16
b.ne 1b
b 3f
.Ldec:
1:
cmp $len, #64
b.lt 1f
ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
cmp $len,#128
b.lt 2f
// 8 blocks mode
ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
___
# 8-block decryption: the blocks are independent, so two 4-block passes are
# used.  ($dat[N] and @dat[N] interpolate to the same register name.)
&rev32(@dat[0],@dat[0]);
&rev32(@dat[1],@dat[1]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],$dat[3]);
&rev32(@dat[4],@dat[4]);
&rev32(@dat[5],@dat[5]);
&rev32(@dat[6],@dat[6]);
&rev32(@dat[7],$dat[7]);
&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
&rev32(@dat[0],@dat[0]);
&rev32(@dat[1],@dat[1]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],@dat[3]);
&rev32(@dat[4],@dat[4]);
&rev32(@dat[5],@dat[5]);
&rev32(@dat[6],@dat[6]);
&rev32(@dat[7],@dat[7]);
# XOR block 0 with the IV and block N with ciphertext N-1; the last
# ciphertext block of the group becomes the new IV.
$code.=<<___;
eor @dat[0].16b,@dat[0].16b,$ivec.16b
eor @dat[1].16b,@dat[1].16b,@in[0].16b
eor @dat[2].16b,@dat[2].16b,@in[1].16b
mov $ivec.16b,@in[7].16b
eor @dat[3].16b,$dat[3].16b,@in[2].16b
eor @dat[4].16b,$dat[4].16b,@in[3].16b
eor @dat[5].16b,$dat[5].16b,@in[4].16b
eor @dat[6].16b,$dat[6].16b,@in[5].16b
eor @dat[7].16b,$dat[7].16b,@in[6].16b
st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
subs $len,$len,128
b.gt 1b
b 3f
// 4 blocks mode
2:
___
# 4-block decryption.
&rev32(@dat[0],@dat[0]);
&rev32(@dat[1],@dat[1]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],$dat[3]);
&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
&rev32(@dat[0],@dat[0]);
&rev32(@dat[1],@dat[1]);
&rev32(@dat[2],@dat[2]);
&rev32(@dat[3],@dat[3]);
# Chaining XOR for the 4-block group, then the start of the single-block
# decryption tail, which keeps a copy of the ciphertext in @in[0].
$code.=<<___;
eor @dat[0].16b,@dat[0].16b,$ivec.16b
eor @dat[1].16b,@dat[1].16b,@in[0].16b
mov $ivec.16b,@in[3].16b
eor @dat[2].16b,@dat[2].16b,@in[1].16b
eor @dat[3].16b,$dat[3].16b,@in[2].16b
st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
subs $len,$len,#64
b.gt 1b
1:
subs $len,$len,#16
b.lt 3f
ld1 {@dat[0].4s},[$inp],#16
mov @in[0].16b,@dat[0].16b
___
&rev32(@dat[0],@dat[0]);
&enc_blk(@dat[0]);
&rev32(@dat[0],@dat[0]);
# Finish the tail block, then the common exit at label 3: write the IV back
# through x4 and restore d8/d9.
$code.=<<___;
eor @dat[0].16b,@dat[0].16b,$ivec.16b
mov $ivec.16b,@in[0].16b
st1 {@dat[0].16b},[$out],#16
b.ne 1b
3:
// save back IV
st1 {$ivec.16b},[$ivp]
ldp d8,d9,[sp],#16
ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks
#   x0: input pointer      x1: output pointer
#   x2: number of 16-byte blocks
#   x3: pointer to the round keys
#   x4: pointer to the initial counter block
# w5 carries the 32-bit counter taken from word 3 of the (byte-swapped)
# counter block held in v8.  @dat (v16-v23) holds the counter blocks being
# processed and @in (v24-v31) the input data they are XORed with.  d8/d9 are
# pushed on entry and popped on exit.
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($ctr)=("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec)=("v8");

# Prologue: load the counter block and the round keys.
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
stp d8,d9,[sp, #-16]!
ld1 {$ivec.4s},[$ivp]
ld1 {$rks[0].4s,$rks[1].4s,$rks[2].4s,$rks[3].4s},[$rk],64
ld1 {$rks[4].4s,$rks[5].4s,$rks[6].4s,$rks[7].4s},[$rk]
___

# The counter block is byte-swapped once here, so the counter blocks built
# from it below are passed to the round helpers without a further input swap.
&rev32($ivec,$ivec);

# Build four (and, when at least eight blocks remain, eight) counter blocks
# by copying the counter block and patching word 3 with successive values.
$code.=<<___;
mov $ctr,$ivec.s[3]
1:
cmp $len,#4
b.lt 1f
ld1 {$in[0].4s,$in[1].4s,$in[2].4s,$in[3].4s},[$inp],#64
mov $dat[0].16b,$ivec.16b
mov $dat[1].16b,$ivec.16b
mov $dat[2].16b,$ivec.16b
mov $dat[3].16b,$ivec.16b
add $ctr,$ctr,#1
mov $dat[1].s[3],$ctr
add $ctr,$ctr,#1
mov $dat[2].s[3],$ctr
add $ctr,$ctr,#1
mov $dat[3].s[3],$ctr
cmp $len,#8
b.lt 2f
ld1 {$in[4].4s,$in[5].4s,$in[6].4s,$in[7].4s},[$inp],#64
mov $dat[4].16b,$ivec.16b
mov $dat[5].16b,$ivec.16b
mov $dat[6].16b,$ivec.16b
mov $dat[7].16b,$ivec.16b
add $ctr,$ctr,#1
mov $dat[4].s[3],$ctr
add $ctr,$ctr,#1
mov $dat[5].s[3],$ctr
add $ctr,$ctr,#1
mov $dat[6].s[3],$ctr
add $ctr,$ctr,#1
mov $dat[7].s[3],$ctr
___

# 8-block path: process the counter blocks, swap out, XOR with the input.
&enc_4blks(@dat[0..3]);
&enc_4blks(@dat[4..7]);
&rev32($_,$_) foreach @dat[0..7];
$code.=<<___;
eor $dat[0].16b,$dat[0].16b,$in[0].16b
eor $dat[1].16b,$dat[1].16b,$in[1].16b
eor $dat[2].16b,$dat[2].16b,$in[2].16b
eor $dat[3].16b,$dat[3].16b,$in[3].16b
eor $dat[4].16b,$dat[4].16b,$in[4].16b
eor $dat[5].16b,$dat[5].16b,$in[5].16b
eor $dat[6].16b,$dat[6].16b,$in[6].16b
eor $dat[7].16b,$dat[7].16b,$in[7].16b
st1 {$dat[0].4s,$dat[1].4s,$dat[2].4s,$dat[3].4s},[$out],#64
st1 {$dat[4].4s,$dat[5].4s,$dat[6].4s,$dat[7].4s},[$out],#64
subs $len,$len,#8
b.eq 3f
add $ctr,$ctr,#1
mov $ivec.s[3],$ctr
b 1b
2:
___

# 4-block path.
&enc_4blks(@dat[0..3]);
&rev32($_,$_) foreach @dat[0..3];
$code.=<<___;
eor $dat[0].16b,$dat[0].16b,$in[0].16b
eor $dat[1].16b,$dat[1].16b,$in[1].16b
eor $dat[2].16b,$dat[2].16b,$in[2].16b
eor $dat[3].16b,$dat[3].16b,$in[3].16b
st1 {$dat[0].4s,$dat[1].4s,$dat[2].4s,$dat[3].4s},[$out],#64
subs $len,$len,#4
b.eq 3f
add $ctr,$ctr,#1
mov $ivec.s[3],$ctr
b 1b
1:
subs $len,$len,#1
b.lt 3f
mov $dat[0].16b,$ivec.16b
ld1 {$in[0].4s},[$inp],#16
___

# Single-block tail; the b.eq after the store still tests the flags set by
# the subs at the top of the tail loop.
&enc_blk($dat[0]);
&rev32($dat[0],$dat[0]);
$code.=<<___;
eor $dat[0].16b,$dat[0].16b,$in[0].16b
st1 {$dat[0].4s},[$out],#16
b.eq 3f
add $ctr,$ctr,#1
mov $ivec.s[3],$ctr
b 1b
3:
ldp d8,d9,[sp],#16
ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
########################################
{
# Base encodings of the SM4 instructions; the register numbers are ORed in.
my %opcode = (
    "sm4e" => 0xcec08400,
    "sm4ekey" => 0xce60c800);

# unsm4(mnemonic, operands)
#
# Translates one SM4 instruction into an ".inst" directive carrying its
# encoding, so the output assembles even when the assembler does not know the
# mnemonic.  The first register number goes into bits 0-4, the second into
# bits 5-9 and the optional third into bits 16-20.  The original instruction
# text is kept as a trailing "//" comment.
#
# Returns the ".inst" line.  If the mnemonic is not in %opcode or the operands
# cannot be parsed, the text is returned as "mnemonic operands" instead of an
# empty string or a bogus encoding, so the instruction is not silently dropped
# from the generated file.
sub unsm4 {
    my ($mnemonic,$arg)=@_;
    my $unchanged = "$mnemonic $arg";

    return $unchanged unless exists $opcode{$mnemonic};
    return $unchanged unless
        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/;

    # The third operand is absent for two-operand instructions such as sm4e.
    my ($rd,$rn,$rm) = ($1, $2, defined $3 ? $3 : 0);

    return sprintf ".inst\t0x%08x\t//%s %s",
        $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
        $mnemonic,$arg;
}
}
# Copy the leading comment block of this script into the output, turning
# "#" comments into "//" comments.  The "#!" line is skipped and copying stops
# at the first line that is neither a comment nor empty.
# The three-argument open keeps characters in the script path from being
# interpreted as an open mode; as before, a script that cannot be read simply
# results in no header being copied.
if (open(my $self, '<', $0)) {
    while(<$self>) {
        next if (/^#!/);
        last if (!s/^#/\/\// and !/^$/);
        print;
    }
    close $self;
}

# Emit the generated code line by line: evaluate `...` expressions and
# replace SM4 instructions with their .inst encoding.
foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
    print $_,"\n";
}

# STDOUT is the pipe to the translator; a failure there must fail the build.
close STDOUT or die "error closing STDOUT: $!";