linux/arch/arm64/crypto/sm4-ce-core.S
Tianjia Zhang 6b5360a5e0 crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac
This patch adds a CE-optimized assembly implementation of cmac/xcbc/cbcmac.

Benchmarked on a T-Head Yitian-710 at 2.75 GHz using tcrypt mode 300,
comparing throughput before and after this patch (before this patch, the
XXXmac(sm4-ce) template drivers were used). The columns are update sizes
in bytes; all figures are in Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2022-11-04 17:34:43 +08:00
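
For reference, all three MAC modes below share one chaining primitive:
XOR the next message block into the running MAC state and encrypt the
result. A minimal C sketch of that chaining follows; sm4_encrypt_block
and mac_update_blocks are hypothetical names for illustration, not
interfaces defined by this patch, and the cmac/xcbc/cbcmac differences
(final-block masking) are left to the caller:

    #include <stddef.h>
    #include <stdint.h>

    #define SM4_BLOCK_SIZE 16

    /* Placeholder: encrypt one 16-byte block with an expanded key. */
    void sm4_encrypt_block(const uint32_t *rkey,
                           uint8_t out[SM4_BLOCK_SIZE],
                           const uint8_t in[SM4_BLOCK_SIZE]);

    /* mac = E(mac ^ block) for each full input block. */
    static void mac_update_blocks(const uint32_t *rkey,
                                  uint8_t mac[SM4_BLOCK_SIZE],
                                  const uint8_t *src, size_t nblocks)
    {
            while (nblocks--) {
                    for (int i = 0; i < SM4_BLOCK_SIZE; i++)
                            mac[i] ^= src[i];
                    sm4_encrypt_block(rkey, mac, mac);
                    src += SM4_BLOCK_SIZE;
            }
    }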


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
* as specified in
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
*
* Copyright (C) 2022, Alibaba Group.
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
.arch armv8-a+crypto
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
20, 24, 25, 26, 27, 28, 29, 30, 31
.set .Lv\b\().4s, \b
.endr
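/*
* The sm4e and sm4ekey macros below emit the SM4 Crypto Extension
* instructions as raw .inst words, using the .Lv<n> register-number
* symbols defined above, so that this file still assembles with
* toolchains whose assembler does not know these mnemonics.
*/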
.macro sm4e, vd, vn
.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
.macro sm4ekey, vd, vn, vm
.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
/* Register macros */
#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19
#define RIV v20
#define RMAC v20
#define RMASK v21
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
/* input:
* x0: 128-bit key
* x1: rkey_enc
* x2: rkey_dec
* x3: fk array
* x4: ck array
*/
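/*
* Key schedule: the user key is XORed with the FK constants and run
* through eight sm4ekey steps, each consuming four CK constants and
* producing four round keys.  The encryption schedule is stored to
* rkey_enc as generated; the decryption schedule is the same 32 round
* keys in reverse order, built by reversing the vector order and
* reversing the 32-bit words within each vector via tbl with
* .Lbswap128_mask.
*/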
ld1 {v0.16b}, [x0];
rev32 v0.16b, v0.16b;
ld1 {v1.16b}, [x3];
/* load ck */
ld1 {v24.16b-v27.16b}, [x4], #64;
ld1 {v28.16b-v31.16b}, [x4];
/* input ^ fk */
eor v0.16b, v0.16b, v1.16b;
sm4ekey v0.4s, v0.4s, v24.4s;
sm4ekey v1.4s, v0.4s, v25.4s;
sm4ekey v2.4s, v1.4s, v26.4s;
sm4ekey v3.4s, v2.4s, v27.4s;
sm4ekey v4.4s, v3.4s, v28.4s;
sm4ekey v5.4s, v4.4s, v29.4s;
sm4ekey v6.4s, v5.4s, v30.4s;
sm4ekey v7.4s, v6.4s, v31.4s;
adr_l x5, .Lbswap128_mask
ld1 {v24.16b}, [x5]
st1 {v0.16b-v3.16b}, [x1], #64;
st1 {v4.16b-v7.16b}, [x1];
tbl v16.16b, {v7.16b}, v24.16b
tbl v17.16b, {v6.16b}, v24.16b
tbl v18.16b, {v5.16b}, v24.16b
tbl v19.16b, {v4.16b}, v24.16b
tbl v20.16b, {v3.16b}, v24.16b
tbl v21.16b, {v2.16b}, v24.16b
tbl v22.16b, {v1.16b}, v24.16b
tbl v23.16b, {v0.16b}, v24.16b
st1 {v16.16b-v19.16b}, [x2], #64
st1 {v20.16b-v23.16b}, [x2]
ret;
SYM_FUNC_END(sm4_ce_expand_key)
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
*/
SM4_PREPARE(x0)
ld1 {v0.16b}, [x2];
SM4_CRYPT_BLK(v0);
st1 {v0.16b}, [x1];
ret;
SYM_FUNC_END(sm4_ce_crypt_block)
.align 3
SYM_FUNC_START(sm4_ce_crypt)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* w3: nblocks
*/
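/*
* Bulk processing with no chaining between blocks: handle eight blocks
* per iteration, then a four-block group, then single blocks.  The
* "tbnz w3, #31" test catches the underflow of the preceding sub when
* fewer than eight blocks remain.
*/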
SM4_PREPARE(x0)
.Lcrypt_loop_blk:
sub w3, w3, #8;
tbnz w3, #31, .Lcrypt_tail8;
ld1 {v0.16b-v3.16b}, [x2], #64;
ld1 {v4.16b-v7.16b}, [x2], #64;
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
st1 {v0.16b-v3.16b}, [x1], #64;
st1 {v4.16b-v7.16b}, [x1], #64;
cbz w3, .Lcrypt_end;
b .Lcrypt_loop_blk;
.Lcrypt_tail8:
add w3, w3, #8;
cmp w3, #4;
blt .Lcrypt_tail4;
sub w3, w3, #4;
ld1 {v0.16b-v3.16b}, [x2], #64;
SM4_CRYPT_BLK4(v0, v1, v2, v3);
st1 {v0.16b-v3.16b}, [x1], #64;
cbz w3, .Lcrypt_end;
.Lcrypt_tail4:
sub w3, w3, #1;
ld1 {v0.16b}, [x2], #16;
SM4_CRYPT_BLK(v0);
st1 {v0.16b}, [x1], #16;
cbnz w3, .Lcrypt_tail4;
.Lcrypt_end:
ret;
SYM_FUNC_END(sm4_ce_crypt)
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
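/*
* CBC encryption is inherently serial (C[i] = E(P[i] ^ C[i-1])), so the
* cipher still runs one block at a time; the 4x path only batches the
* loads and stores.
*/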
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcbc_enc_loop_4x:
cmp w4, #4
blt .Lcbc_enc_loop_1x
sub w4, w4, #4
ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, RIV.16b
SM4_CRYPT_BLK(v0)
eor v1.16b, v1.16b, v0.16b
SM4_CRYPT_BLK(v1)
eor v2.16b, v2.16b, v1.16b
SM4_CRYPT_BLK(v2)
eor v3.16b, v3.16b, v2.16b
SM4_CRYPT_BLK(v3)
st1 {v0.16b-v3.16b}, [x1], #64
mov RIV.16b, v3.16b
cbz w4, .Lcbc_enc_end
b .Lcbc_enc_loop_4x
.Lcbc_enc_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
eor RIV.16b, RIV.16b, v0.16b
SM4_CRYPT_BLK(RIV)
st1 {RIV.16b}, [x1], #16
cbnz w4, .Lcbc_enc_loop_1x
.Lcbc_enc_end:
/* store new IV */
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cbc_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
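/*
* CBC decryption parallelizes (P[i] = D(C[i]) ^ C[i-1], where x0 is
* expected to hold the decryption key schedule), so eight blocks are
* processed per iteration.  RIV carries the previous ciphertext block
* across iterations.
*/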
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcbc_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lcbc_dec_4x
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
rev32 v12.16b, v4.16b
rev32 v13.16b, v5.16b
rev32 v14.16b, v6.16b
rev32 v15.16b, v7.16b
SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
eor v12.16b, v12.16b, v3.16b
eor v13.16b, v13.16b, v4.16b
eor v14.16b, v14.16b, v5.16b
eor v15.16b, v15.16b, v6.16b
st1 {v8.16b-v11.16b}, [x1], #64
st1 {v12.16b-v15.16b}, [x1], #64
mov RIV.16b, v7.16b
cbz w4, .Lcbc_dec_end
b .Lcbc_dec_loop_8x
.Lcbc_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lcbc_dec_loop_1x
sub w4, w4, #4
ld1 {v0.16b-v3.16b}, [x2], #64
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
st1 {v8.16b-v11.16b}, [x1], #64
mov RIV.16b, v3.16b
cbz w4, .Lcbc_dec_end
.Lcbc_dec_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
rev32 v8.16b, v0.16b
SM4_CRYPT_BLK_BE(v8)
eor v8.16b, v8.16b, RIV.16b
st1 {v8.16b}, [x1], #16
mov RIV.16b, v0.16b
cbnz w4, .Lcbc_dec_loop_1x
.Lcbc_dec_end:
/* store new IV */
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cbc_dec)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nbytes
*/
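/*
* CBC ciphertext stealing for the final, possibly partial, pair of
* blocks: x5 holds the tail length (nbytes - 16), and the slices of
* .Lcts_permute_table selected through x5 form the byte-shuffle masks
* for the overlapping tail load and the swapped final stores.
*/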
SM4_PREPARE(x0)
sub w5, w4, #16
uxtw x5, w5
ld1 {RIV.16b}, [x3]
ld1 {v0.16b}, [x2]
eor RIV.16b, RIV.16b, v0.16b
SM4_CRYPT_BLK(RIV)
/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
/* overlapping loads */
add x2, x2, x5
ld1 {v1.16b}, [x2]
/* create Cn from En-1 */
tbl v0.16b, {RIV.16b}, v3.16b
/* padding Pn with zeros */
tbl v1.16b, {v1.16b}, v4.16b
eor v1.16b, v1.16b, RIV.16b
SM4_CRYPT_BLK(v1)
/* overlapping stores */
add x5, x1, x5
st1 {v0.16b}, [x5]
st1 {v1.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nbytes
*/
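/*
* CBC ciphertext stealing, decrypt direction: Cn-1 is decrypted to Xn,
* the partial Cn recovers Pn from the head of Xn, and Cn is then
* spliced over that head (tbx) to rebuild En-1 for the final CBC
* decryption against the IV.
*/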
SM4_PREPARE(x0)
sub w5, w4, #16
uxtw x5, w5
ld1 {RIV.16b}, [x3]
/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
/* overlapping loads */
ld1 {v0.16b}, [x2], x5
ld1 {v1.16b}, [x2]
SM4_CRYPT_BLK(v0)
/* select the first Ln bytes of Xn to create Pn */
tbl v2.16b, {v0.16b}, v3.16b
eor v2.16b, v2.16b, v1.16b
/* overwrite the first Ln bytes with Cn to create En-1 */
tbx v0.16b, {v1.16b}, v4.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, RIV.16b
/* overlapping stores */
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
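/*
* CFB encryption is serial: the keystream for each block is the
* encryption of the previous ciphertext block (C[i] = P[i] ^ E(C[i-1])),
* so the 4x path batches only the loads and stores.
*/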
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcfb_enc_loop_4x:
cmp w4, #4
blt .Lcfb_enc_loop_1x
sub w4, w4, #4
ld1 {v0.16b-v3.16b}, [x2], #64
rev32 v8.16b, RIV.16b
SM4_CRYPT_BLK_BE(v8)
eor v0.16b, v0.16b, v8.16b
rev32 v8.16b, v0.16b
SM4_CRYPT_BLK_BE(v8)
eor v1.16b, v1.16b, v8.16b
rev32 v8.16b, v1.16b
SM4_CRYPT_BLK_BE(v8)
eor v2.16b, v2.16b, v8.16b
rev32 v8.16b, v2.16b
SM4_CRYPT_BLK_BE(v8)
eor v3.16b, v3.16b, v8.16b
st1 {v0.16b-v3.16b}, [x1], #64
mov RIV.16b, v3.16b
cbz w4, .Lcfb_enc_end
b .Lcfb_enc_loop_4x
.Lcfb_enc_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
SM4_CRYPT_BLK(RIV)
eor RIV.16b, RIV.16b, v0.16b
st1 {RIV.16b}, [x1], #16
cbnz w4, .Lcfb_enc_loop_1x
.Lcfb_enc_end:
/* store new IV */
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cfb_enc)
.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
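/*
* CFB decryption parallelizes: every keystream block E(C[i-1]) depends
* only on ciphertext that is already available, so up to eight blocks
* are processed per iteration.  The block cipher is always used in the
* encrypt direction for CFB.
*/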
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3]
.Lcfb_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lcfb_dec_4x
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
rev32 v8.16b, RIV.16b
rev32 v9.16b, v0.16b
rev32 v10.16b, v1.16b
rev32 v11.16b, v2.16b
rev32 v12.16b, v3.16b
rev32 v13.16b, v4.16b
rev32 v14.16b, v5.16b
rev32 v15.16b, v6.16b
SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
mov RIV.16b, v7.16b
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
cbz w4, .Lcfb_dec_end
b .Lcfb_dec_loop_8x
.Lcfb_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lcfb_dec_loop_1x
sub w4, w4, #4
ld1 {v0.16b-v3.16b}, [x2], #64
rev32 v8.16b, RIV.16b
rev32 v9.16b, v0.16b
rev32 v10.16b, v1.16b
rev32 v11.16b, v2.16b
SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
mov RIV.16b, v3.16b
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64
cbz w4, .Lcfb_dec_end
.Lcfb_dec_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
SM4_CRYPT_BLK(RIV)
eor RIV.16b, RIV.16b, v0.16b
st1 {RIV.16b}, [x1], #16
mov RIV.16b, v0.16b
cbnz w4, .Lcfb_dec_loop_1x
.Lcfb_dec_end:
/* store new IV */
st1 {RIV.16b}, [x3]
ret
SYM_FUNC_END(sm4_ce_cfb_dec)
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: ctr (big endian, 128 bit)
* w4: nblocks
*/
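/*
* The 128-bit big-endian counter is kept in x7 (high 64 bits) and x8
* (low 64 bits) and incremented as a plain integer with adds/adc; the
* inc_le128 macro below materializes one big-endian counter block per
* vector register.  Keystream blocks E(ctr) are XORed into the data.
*/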
SM4_PREPARE(x0)
ldp x7, x8, [x3]
rev x7, x7
rev x8, x8
.Lctr_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lctr_4x
#define inc_le128(vctr) \
mov vctr.d[1], x8; \
mov vctr.d[0], x7; \
adds x8, x8, #1; \
rev64 vctr.16b, vctr.16b; \
adc x7, x7, xzr;
/* construct CTRs */
inc_le128(v0) /* +0 */
inc_le128(v1) /* +1 */
inc_le128(v2) /* +2 */
inc_le128(v3) /* +3 */
inc_le128(v4) /* +4 */
inc_le128(v5) /* +5 */
inc_le128(v6) /* +6 */
inc_le128(v7) /* +7 */
ld1 {v8.16b-v11.16b}, [x2], #64
ld1 {v12.16b-v15.16b}, [x2], #64
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
cbz w4, .Lctr_end
b .Lctr_loop_8x
.Lctr_4x:
add w4, w4, #8
cmp w4, #4
blt .Lctr_loop_1x
sub w4, w4, #4
/* construct CTRs */
inc_le128(v0) /* +0 */
inc_le128(v1) /* +1 */
inc_le128(v2) /* +2 */
inc_le128(v3) /* +3 */
ld1 {v8.16b-v11.16b}, [x2], #64
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64
cbz w4, .Lctr_end
.Lctr_loop_1x:
sub w4, w4, #1
/* construct CTRs */
inc_le128(v0)
ld1 {v8.16b}, [x2], #16
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16
cbnz w4, .Lctr_loop_1x
.Lctr_end:
/* store new CTR */
rev x7, x7
rev x8, x8
stp x7, x8, [x3]
ret
SYM_FUNC_END(sm4_ce_ctr_enc)
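/*
* Galois-field doubling of the XTS tweak: vt = vin * x in GF(2^128)
* with the polynomial x^128 + x^7 + x^2 + x + 1.  Each 64-bit half is
* shifted left by one, the carry out of the low half moves into bit 0
* of the high half, and the carry out of bit 127 is folded back into
* the low half as 0x87.  RMASK holds the { 0x1, 0x87 } carry constants.
*/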
#define tweak_next(vt, vin, RTMP) \
sshr RTMP.2d, vin.2d, #63; \
and RTMP.16b, RTMP.16b, RMASK.16b; \
add vt.2d, vin.2d, vin.2d; \
ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
eor vt.16b, vt.16b, RTMP.16b;
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: tweak (big endian, 128 bit)
* w4: nbytes
* x5: round key array for IV
*/
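/*
* w4 is a byte count: the tail length (nbytes & 15) is kept in x5, and
* a non-zero tail is handled by ciphertext stealing at .Lxts_enc_cts,
* with the last full block held back for the stolen pair.  A non-NULL
* x5 on entry marks the first call of a request, in which case the
* initial tweak is generated by encrypting the IV with that key.
*/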
ld1 {v8.16b}, [x3]
cbz x5, .Lxts_enc_nofirst
SM4_PREPARE(x5)
/* Generate first tweak */
SM4_CRYPT_BLK(v8)
.Lxts_enc_nofirst:
SM4_PREPARE(x0)
ands w5, w4, #15
lsr w4, w4, #4
sub w6, w4, #1
csel w4, w4, w6, eq
uxtw x5, w5
movi RMASK.2s, #0x1
movi RTMP0.2s, #0x87
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
cbz w4, .Lxts_enc_cts
.Lxts_enc_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lxts_enc_4x
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
tweak_next(v8, v15, RTMP3)
cbz w4, .Lxts_enc_cts
b .Lxts_enc_loop_8x
.Lxts_enc_4x:
add w4, w4, #8
cmp w4, #4
blt .Lxts_enc_loop_1x
sub w4, w4, #4
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64
tweak_next(v8, v11, RTMP3)
cbz w4, .Lxts_enc_cts
.Lxts_enc_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16
tweak_next(v8, v8, RTMP0)
cbnz w4, .Lxts_enc_loop_1x
.Lxts_enc_cts:
cbz x5, .Lxts_enc_end
/* cipher text stealing */
tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
/* overlapping loads */
add x2, x2, x5
ld1 {v1.16b}, [x2]
/* create Cn from En-1 */
tbl v2.16b, {v0.16b}, v3.16b
/* padding Pn with En-1 at the end */
tbx v0.16b, {v1.16b}, v4.16b
eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b
/* overlapping stores */
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
b .Lxts_enc_ret
.Lxts_enc_end:
/* store new tweak */
st1 {v8.16b}, [x3]
.Lxts_enc_ret:
ret
SYM_FUNC_END(sm4_ce_xts_enc)
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: tweak (big endian, 128 bit)
* w4: nbytes
* x5: round key array for IV
*/
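/*
* XTS decryption; same structure as sm4_ce_xts_enc, except that in the
* ciphertext-stealing path the last two tweaks are applied in swapped
* order: the next-to-last ciphertext block uses the final tweak (v9)
* and the rebuilt last block uses the previous one (v8).
*/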
ld1 {v8.16b}, [x3]
cbz x5, .Lxts_dec_nofirst
SM4_PREPARE(x5)
/* Generate first tweak */
SM4_CRYPT_BLK(v8)
.Lxts_dec_nofirst:
SM4_PREPARE(x0)
ands w5, w4, #15
lsr w4, w4, #4
sub w6, w4, #1
csel w4, w4, w6, eq
uxtw x5, w5
movi RMASK.2s, #0x1
movi RTMP0.2s, #0x87
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
cbz w4, .Lxts_dec_cts
.Lxts_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lxts_dec_4x
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
tweak_next(v8, v15, RTMP3)
cbz w4, .Lxts_dec_cts
b .Lxts_dec_loop_8x
.Lxts_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lxts_dec_loop_1x
sub w4, w4, #4
tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
SM4_CRYPT_BLK4(v0, v1, v2, v3)
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64
tweak_next(v8, v11, RTMP3)
cbz w4, .Lxts_dec_cts
.Lxts_dec_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16
tweak_next(v8, v8, RTMP0)
cbnz w4, .Lxts_dec_loop_1x
.Lxts_dec_cts:
cbz x5, .Lxts_dec_end
/* cipher text stealing */
tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b
/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]
/* overlapping loads */
add x2, x2, x5
ld1 {v1.16b}, [x2]
/* create Cn from En-1 */
tbl v2.16b, {v0.16b}, v3.16b
/* padding Pn with En-1 at the end */
tbx v0.16b, {v1.16b}, v4.16b
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b
/* overlapping stores */
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]
b .Lxts_dec_ret
.Lxts_dec_end:
/* store new tweak */
st1 {v8.16b}, [x3]
.Lxts_dec_ret:
ret
SYM_FUNC_END(sm4_ce_xts_dec)
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
/* input:
* x0: round key array, CTX
* x1: digest
* x2: src
* w3: nblocks
* w4: enc_before
* w5: enc_after
*/
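/*
* Shared block processing for cmac/xcbc/cbcmac: the running MAC is
* chained as mac = E(mac ^ block).  A non-zero enc_before encrypts the
* current MAC state once before any src blocks are consumed; when
* enc_after is zero, the final src block is only XORed into the MAC,
* leaving its encryption (and any final-block masking done by the
* caller) to a later call.
*/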
SM4_PREPARE(x0)
ld1 {RMAC.16b}, [x1]
cbz w4, .Lmac_update
SM4_CRYPT_BLK(RMAC)
.Lmac_update:
cbz w3, .Lmac_ret
sub w6, w3, #1
cmp w5, wzr
csel w3, w3, w6, ne
cbz w3, .Lmac_end
.Lmac_loop_4x:
cmp w3, #4
blt .Lmac_loop_1x
sub w3, w3, #4
ld1 {v0.16b-v3.16b}, [x2], #64
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v1.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v2.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v3.16b
SM4_CRYPT_BLK(RMAC)
cbz w3, .Lmac_end
b .Lmac_loop_4x
.Lmac_loop_1x:
sub w3, w3, #1
ld1 {v0.16b}, [x2], #16
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
cbnz w3, .Lmac_loop_1x
.Lmac_end:
cbnz w5, .Lmac_ret
ld1 {v0.16b}, [x2], #16
eor RMAC.16b, RMAC.16b, v0.16b
.Lmac_ret:
st1 {RMAC.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_mac_update)
.section ".rodata", "a"
.align 4
.Lbswap128_mask:
.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
.Lcts_permute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff