/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		neon

	/*
	 * Scalar register roles shared by the 8-way cipher routines:
	 * rounds is counted down once per AES round, bskey is the moving
	 * pointer into the bit sliced key schedule.
	 */
	rounds		.req	ip
	bskey		.req	r4

	/*
	 * Names for the low and high D register of every Q register, so that
	 * a macro receiving a Q register argument can address its halves by
	 * appending 'l' or 'h' to the argument (see __tbl and __ldr).
	 */
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15
	q8l		.req	d16
	q8h		.req	d17
	q9l		.req	d18
	q9h		.req	d19
	q10l		.req	d20
	q10h		.req	d21
	q11l		.req	d22
	q11h		.req	d23
	q12l		.req	d24
	q12h		.req	d25
	q13l		.req	d26
	q13h		.req	d27
	q14l		.req	d28
	q14h		.req	d29
	q15l		.req	d30
	q15h		.req	d31

/*
 * __tbl - permute the 16 bytes of a Q register through a byte index vector
 *
 * out: destination Q register
 * tbl: Q register holding the 16 table bytes
 * in:  Q register holding the byte indexes
 * tmp: scratch Q register, mandatory when out and tbl are the same register
 *
 * vtbl.8 produces one D register per instruction, so the lookup is issued
 * once for each half of \in. If \out aliases \tbl, the first lookup would
 * overwrite half of the table, so the table is copied to \tmp beforehand
 * and the second lookup reads that copy.
 */
	.macro		__tbl, out, tbl, in, tmp
	.ifc		\out, \tbl
	.ifb		\tmp
	.error		__tbl needs temp register if out == tbl
	.endif
	vmov		\tmp, \out
	.endif
	vtbl.8		\out\()l, {\tbl}, \in\()l
	.ifc		\out, \tbl
	vtbl.8		\out\()h, {\tmp}, \in\()h
	.else
	vtbl.8		\out\()h, {\tbl}, \in\()h
	.endif
	.endm

/*
 * __ldr - load the 16 bytes at \sym into Q register \out
 *
 * Done as two 8-byte vldr loads, one per D register half (\sym and
 * \sym + 8).
 */
	.macro		__ldr, out, sym
	vldr		\out\()l, \sym
	vldr		\out\()h, \sym + 8
	.endm

/*
 * in_bs_ch - XOR-only in-place transform of the eight registers b0-b7
 *
 * First step of the sbox macro, applied before inv_gf256. No temporaries
 * are used; every argument except b0 is modified (b0 is only read).
 * NOTE(review): going by the name this is the input change of basis of
 * the bit sliced S-box - confirm against the referenced paper.
 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b2, \b2, \b1
	veor		\b5, \b5, \b6
	veor		\b3, \b3, \b0
	veor		\b6, \b6, \b2
	veor		\b5, \b5, \b0
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b4, \b4, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b1, \b1, \b5
	.endm

/*
 * out_bs_ch - XOR-only in-place transform of the eight registers b0-b7
 *
 * Last step of the sbox macro, applied to the result of inv_gf256. No
 * temporaries are used.
 * NOTE(review): going by the name this is the output change of basis of
 * the bit sliced S-box - confirm against the referenced paper.
 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	veor		\b4, \b4, \b6
	veor		\b2, \b2, \b0
	veor		\b6, \b6, \b1
	veor		\b1, \b1, \b5
	veor		\b5, \b5, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b2, \b2, \b5
	veor		\b4, \b4, \b7
	.endm

/*
 * inv_in_bs_ch - XOR-only in-place transform used as the first step of
 * inv_sbox, before inv_gf256
 *
 * Note that the formal parameters are declared in the order
 * b6, b1, b2, b4, b7, b0, b3, b5, so the positional arguments of the
 * caller are bound to the names in that order.
 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	veor		\b1, \b1, \b7
	veor		\b4, \b4, \b7
	veor		\b7, \b7, \b5
	veor		\b1, \b1, \b3
	veor		\b2, \b2, \b5
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b1
	veor		\b2, \b2, \b0
	veor		\b5, \b5, \b3
	veor		\b4, \b4, \b6
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	.endm

/*
 * inv_out_bs_ch - XOR-only in-place transform used as the last step of
 * inv_sbox, after inv_gf256
 *
 * Note that the formal parameters are declared in the order
 * b6, b5, b0, b3, b7, b1, b4, b2, so the positional arguments of the
 * caller are bound to the names in that order.
 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	veor		\b1, \b1, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b4, \b4, \b5
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b5, \b5, \b0
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b2
	veor		\b2, \b2, \b1
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b0
	veor		\b5, \b5, \b6
	.endm

/*
 * mul_gf4 - combine the register pair (x0, x1) with the pair (y0, y1)
 *
 * Result, expressed in the input values:
 *	x0 = ((x0 ^ x1) & y1) ^ (x1 & y0)
 *	x1 = (x1 & y0) ^ ((y0 ^ y1) & x0)
 *
 * y0/y1 are only read; t0/t1 are clobbered.
 * NOTE(review): by its name this is a bit sliced GF(2^2) multiplication -
 * confirm against the referenced paper.
 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	veor		\t0, \y0, \y1
	vand		\t0, \t0, \x0
	veor		\x0, \x0, \x1
	vand		\t1, \x1, \y0
	vand		\x0, \x0, \y1
	veor		\x1, \t1, \t0
	veor		\x0, \x0, \t1
	.endm

/*
 * mul_gf4_n_gf4 - two interleaved pair products
 *
 * (x2, x3) with (y2, y3), temporary t1: same result as mul_gf4.
 * (x0, x1) with (y0, y1), temporary t0: a variant with a different final
 * combination, expressed in the input values:
 *	x1 = (x1 & y0) ^ ((x0 ^ x1) & y1)
 *	x0 = ((x0 ^ x1) & y1) ^ ((y0 ^ y1) & x0)
 *
 * All y registers are only read; t0 and t1 are clobbered. The two
 * instruction streams are independent and are interleaved one for one.
 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	veor		\t0, \y0, \y1
	veor		\t1, \y2, \y3
	vand		\t0, \t0, \x0
	vand		\t1, \t1, \x2
	veor		\x0, \x0, \x1
	veor		\x2, \x2, \x3
	vand		\x1, \x1, \y0
	vand		\x3, \x3, \y2
	vand		\x0, \x0, \y1
	vand		\x2, \x2, \y3
	veor		\x1, \x1, \x0
	veor		\x2, \x2, \x3
	veor		\x0, \x0, \t0
	veor		\x3, \x3, \t1
	.endm

/*
 * mul_gf16_2 - combine both register quads x0-x3 and x4-x7 with the
 * single quad y0-y3
 *
 * Built from mul_gf4 and mul_gf4_n_gf4. The results replace x0-x7 and
 * t0-t3 are clobbered. y0/y1 are temporarily XORed with y2/y3 for the
 * first quad and XORed again while processing the second quad, which
 * restores their values on exit.
 * NOTE(review): by its name this performs two GF(2^4) multiplications
 * sharing one operand - confirm against the referenced paper.
 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	/* first quad: x0-x3 */
	veor		\t0, \x0, \x2
	veor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	veor		\y0, \y0, \y2
	veor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	veor		\x0, \x0, \t0
	veor		\x2, \x2, \t0
	veor		\x1, \x1, \t1
	veor		\x3, \x3, \t1

	/* second quad: x4-x7, same steps in a different order */
	veor		\t0, \x4, \x6
	veor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	veor		\y0, \y0, \y2
	veor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	veor		\x4, \x4, \t0
	veor		\x6, \x6, \t0
	veor		\x5, \x5, \t1
	veor		\x7, \x7, \t1
	.endm

/*
 * inv_gf256 - non-linear core shared by the sbox and inv_sbox macros
 *
 * x0-x7: the eight bit planes, transformed in place
 * t0-t3, s0-s3: scratch registers, all clobbered
 *
 * A straight-line network of AND/OR/XOR/bit-select operations derives
 * four intermediate values from x0-x7, which are then passed as the
 * second operand to mul_gf16_2 to produce the final x0-x7.
 * NOTE(review): by its name this computes the multiplicative inverse in
 * GF(2^8) - confirm against the referenced paper.
 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	veor		\t3, \x4, \x6
	veor		\t0, \x5, \x7
	veor		\t1, \x1, \x3
	veor		\s1, \x7, \x6
	veor		\s0, \x0, \x2
	veor		\s3, \t3, \t0
	vorr		\t2, \t0, \t1
	vand		\s2, \t3, \s0
	vorr		\t3, \t3, \s0
	veor		\s0, \s0, \t1
	vand		\t0, \t0, \t1
	veor		\t1, \x3, \x2
	vand		\s3, \s3, \s0
	vand		\s1, \s1, \t1
	veor		\t1, \x4, \x5
	veor		\s0, \x1, \x0
	veor		\t3, \t3, \s1
	veor		\t2, \t2, \s1
	vand		\s1, \t1, \s0
	vorr		\t1, \t1, \s0
	veor		\t3, \t3, \s3
	veor		\t0, \t0, \s1
	veor		\t2, \t2, \s2
	veor		\t1, \t1, \s3
	veor		\t0, \t0, \s2
	vand		\s0, \x7, \x3
	veor		\t1, \t1, \s2
	vand		\s1, \x6, \x2
	vand		\s2, \x5, \x1
	vorr		\s3, \x4, \x0
	veor		\t3, \t3, \s0
	veor		\t1, \t1, \s2
	veor		\s0, \t0, \s3
	veor		\t2, \t2, \s1
	vand		\s2, \t3, \t1
	veor		\s1, \t2, \s2
	veor		\s3, \s0, \s2
	vbsl		\s1, \t1, \s0
	vmvn		\t0, \s0
	vbsl		\s0, \s1, \s3
	vbsl		\t0, \s1, \s3
	vbsl		\s3, \t3, \t2
	veor		\t3, \t3, \t2
	vand		\s2, \s0, \s3
	veor		\t1, \t1, \t0
	veor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

/*
 * sbox - forward S-box on the bit sliced state b0-b7
 *
 * Runs in_bs_ch, inv_gf256 and out_bs_ch back to back, each with its own
 * permutation of b0-b7 as arguments. t0-t3 and s0-s3 are scratch and are
 * clobbered. The result is left in b0-b7 in a permuted order; the
 * encryption routine continues with the order b0, b1, b4, b6, b3, b7,
 * b2, b5 (see the mix_cols invocation in aesbs_encrypt8).
 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

/*
 * inv_sbox - inverse S-box on the bit sliced state b0-b7
 *
 * Runs inv_in_bs_ch, inv_gf256 and inv_out_bs_ch back to back, each with
 * its own permutation of b0-b7 as arguments. t0-t3 and s0-s3 are scratch
 * and are clobbered. The result is left in b0-b7 in a permuted order; the
 * decryption routine continues with the order b0, b1, b6, b4, b2, b7,
 * b3, b5 (see the inv_mix_cols invocation in aesbs_decrypt8).
 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm

/*
 * shift_rows - add the next bit sliced round key and permute the state
 *
 * x0-x7: state registers, replaced by the result
 * t0-t3: scratch registers, clobbered
 * mask:  Q register holding the byte permutation applied via __tbl
 *
 * Eight 16-byte key planes (128 bytes) are loaded from bskey with
 * post-increment, so bskey is left pointing 128 bytes further on. bskey
 * must be 32-byte aligned (:256 alignment hint). Each plane is XORed into
 * the matching state register and the sum is permuted by \mask; loads,
 * XORs and lookups are interleaved.
 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				    t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t0, \t0, \x0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t1, \t1, \x1
	__tbl		\x0, \t0, \mask
	veor		\t2, \t2, \x2
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t3, \t3, \x3
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t0, \t0, \x4
	veor		\t1, \t1, \x5
	__tbl		\x4, \t0, \mask
	veor		\t2, \t2, \x6
	__tbl		\x5, \t1, \mask
	veor		\t3, \t3, \x7
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.endm

/*
 * inv_shift_rows - permute each of the state registers x0-x7 in place
 *
 * mask:  Q register holding the byte permutation applied via __tbl
 * t0-t3: scratch registers; needed because every lookup uses the same
 *        register as table and destination (see __tbl), clobbered
 *
 * Unlike shift_rows this macro does not touch the key schedule.
 */
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
					t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.endm

/*
 * mix_cols - column mixing step on the bit sliced state
 *
 * x0-x7: state registers, replaced by the result
 * t0-t7: scratch registers, clobbered
 * inv:   leave blank for the forward cipher; when non-blank the final
 *        XORs are arranged differently (the .else branch below), which
 *        is what inv_mix_cols relies on when it invokes this macro
 *
 * The step is built from byte rotations of each register (vext.8 by 12
 * and by 8 bytes) XORed together across planes. Which plane ends up in
 * which of x0-x7 differs between the two variants.
 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	veor		\x0, \x0, \t0
	vext.8		\t2, \x2, \x2, #12
	veor		\x1, \x1, \t1
	vext.8		\t3, \x3, \x3, #12
	veor		\x2, \x2, \t2
	vext.8		\t4, \x4, \x4, #12
	veor		\x3, \x3, \t3
	vext.8		\t5, \x5, \x5, #12
	veor		\x4, \x4, \t4
	vext.8		\t6, \x6, \x6, #12
	veor		\x5, \x5, \t5
	vext.8		\t7, \x7, \x7, #12
	veor		\x6, \x6, \t6
	veor		\t1, \t1, \x0
	veor.8		\x7, \x7, \t7
	vext.8		\x0, \x0, \x0, #8
	veor		\t2, \t2, \x1
	veor		\t0, \t0, \x7
	veor		\t1, \t1, \x7
	vext.8		\x1, \x1, \x1, #8
	veor		\t5, \t5, \x4
	veor		\x0, \x0, \t0
	veor		\t6, \t6, \x5
	veor		\x1, \x1, \t1
	vext.8		\t0, \x4, \x4, #8
	veor		\t4, \t4, \x3
	vext.8		\t1, \x5, \x5, #8
	veor		\t7, \t7, \x6
	vext.8		\x4, \x3, \x3, #8
	veor		\t3, \t3, \x2
	vext.8		\x5, \x7, \x7, #8
	veor		\t4, \t4, \x7
	vext.8		\x3, \x6, \x6, #8
	veor		\t3, \t3, \x7
	vext.8		\x6, \x2, \x2, #8
	veor		\x7, \t1, \t5
	.ifb		\inv
	veor		\x2, \t0, \t4
	veor		\x4, \x4, \t3
	veor		\x5, \x5, \t7
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t2
	.else
	veor		\t3, \t3, \x4
	veor		\x5, \x5, \t7
	veor		\x2, \x3, \t6
	veor		\x3, \t0, \t4
	veor		\x4, \x6, \t2
	vmov		\x6, \t3
	.endif
	.endm

/*
 * inv_mix_cols - add a bit sliced round key, then undo the column mixing
 *
 * x0-x7: state registers, replaced by the result
 * t0-t7: scratch registers, clobbered
 *
 * Eight 16-byte key planes (128 bytes) are loaded from bskey and XORed
 * into x0-x7. The first three loads post-increment bskey by 32 bytes
 * each; together with the subtraction of 224 this leaves bskey 128 bytes
 * below its value on entry, i.e. the key schedule is walked backwards.
 * bskey must be 32-byte aligned (:256 alignment hint).
 *
 * The inverse transform is computed as a preprocessing step (8-byte
 * rotations XORed across planes) followed by mix_cols with its inv
 * argument set.
 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\x0, \x0, \t0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\x1, \x1, \t1
	vld1.8		{\t4-\t5}, [bskey, :256]!
	veor		\x2, \x2, \t2
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	veor		\x3, \x3, \t3
	veor		\x4, \x4, \t4
	veor		\x5, \x5, \t5
	veor		\x6, \x6, \t6
	veor		\x7, \x7, \t7
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	veor		\t0, \t0, \x0
	vext.8		\t1, \x1, \x1, #8
	veor		\t6, \t6, \x6
	vext.8		\t2, \x2, \x2, #8
	veor		\t7, \t7, \x7
	vext.8		\t3, \x3, \x3, #8
	veor		\t1, \t1, \x1
	vext.8		\t4, \x4, \x4, #8
	veor		\t2, \t2, \x2
	vext.8		\t5, \x5, \x5, #8
	veor		\t3, \t3, \x3
	veor		\t4, \t4, \x4
	veor		\t5, \t5, \x5
	veor		\x0, \x0, \t6
	veor		\x1, \x1, \t6
	veor		\x2, \x2, \t0
	veor		\x4, \x4, \t2
	veor		\x3, \x3, \t1
	veor		\x1, \x1, \t7
	veor		\x2, \x2, \t7
	veor		\x4, \x4, \t6
	veor		\x5, \x5, \t3
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t4
	veor		\x4, \x4, \t7
	veor		\x5, \x5, \t7
	veor		\x7, \x7, \t5
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

/*
 * swapmove_2x - masked bit exchange between two register pairs
 *
 * For each of the pairs (a0, b0) and (a1, b1):
 *	t  = ((b >> n) ^ a) & mask
 *	a ^= t
 *	b ^= t << n
 * i.e. the bits of a selected by \mask are exchanged with the bits of b
 * that sit \n positions higher. Shifts operate on 64-bit lanes.
 * t0 and t1 are clobbered; \mask is only read.
 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	veor		\t0, \t0, \a0
	veor		\t1, \t1, \a1
	vand		\t0, \t0, \mask
	vand		\t1, \t1, \mask
	veor		\a0, \a0, \t0
	vshl.s64	\t0, \t0, #\n
	veor		\a1, \a1, \t1
	vshl.s64	\t1, \t1, #\n
	veor		\b0, \b0, \t0
	veor		\b1, \b1, \t1
	.endm

/*
 * bitslice - transpose bits across the eight registers x0-x7
 *
 * Three rounds of swapmove_2x with shift/mask pairs 1/0x55, 2/0x33 and
 * 4/0x0f exchange bits between registers whose index differs by 1, 2 and
 * 4 respectively. The routines in this file use it both to enter and to
 * leave the bit sliced representation.
 *
 * Note that the formal parameters are declared in descending order
 * (x7 first). t0-t3 are scratch and are clobbered.
 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	vmov.i8		\t0, #0x55
	vmov.i8		\t1, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	vmov.i8		\t0, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

/* byte permutation applied to every inner round key by aesbs_convert_key */
	.align		4
M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * r0: out, r1: rk, r2: rounds
	 *
	 * Layout written to out[]:
	 *   - round 0 key, copied unchanged (16 bytes)
	 *   - for each of the rounds - 1 inner round keys: the key is
	 *     permuted by M0 and every bit position is expanded to a full
	 *     byte mask (vtst), giving 8 x 16 = 128 bytes per round key.
	 *     Planes 0, 1, 5 and 6 are stored inverted; those are the bits
	 *     set in the constant 0x63.
	 *   - last round key XORed with 0x63 in every byte (16 bytes)
	 *
	 * out must be 32-byte aligned after the first 16 bytes (:256 hints
	 * on the plane stores) and 16-byte aligned overall.
	 * Clobbers q0-q15, r0-r2 and the flags.
	 */
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8,  #0x01		// bit masks
	vmov.i8		q9,  #0x02
	vmov.i8		q10, #0x04
	vmov.i8		q11, #0x08
	vmov.i8		q12, #0x10
	vmov.i8		q13, #0x20
	__ldr		q14, M0

	sub		r2, r2, #1		// rounds - 1 keys get bit sliced
	vst1.8		{q7}, [r0, :128]!	// save round 0 key

.Lkey_loop:
	__tbl		q7, q15, q14		// q7 = permuted round key
	vmov.i8		q6, #0x40		// masks 0x40/0x80 live in regs
	vmov.i8		q15, #0x80		// that are reused below

	vtst.8		q0, q7, q8
	vtst.8		q1, q7, q9
	vtst.8		q2, q7, q10
	vtst.8		q3, q7, q11
	vtst.8		q4, q7, q12
	vtst.8		q5, q7, q13
	vtst.8		q6, q7, q6
	vtst.8		q7, q7, q15
	vld1.32		{q15}, [r1]!		// load next round key
	vmvn		q0, q0			// invert planes 0, 1, 5, 6
	vmvn		q1, q1
	vmvn		q5, q5
	vmvn		q6, q6

	subs		r2, r2, #1
	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!
	bne		.Lkey_loop

	/* q15 now holds the last round key, loaded by the final iteration */
	vmov.i8		q7, #0x63		// compose .L63
	veor		q15, q15, q7
	vst1.8		{q15}, [r0, :128]
	bx		lr
ENDPROC(aesbs_convert_key)

/* input permutation used by aesbs_encrypt8 after adding the round 0 key */
	.align		4
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

	/*
	 * aesbs_encrypt8 - encrypt the eight blocks held in q0-q7
	 *
	 * Internal routine with a private register convention:
	 *   in:  q0-q7  = eight input blocks
	 *        bskey  = key schedule as produced by aesbs_convert_key
	 *        rounds = number of AES rounds
	 *   out: results for inputs q0..q7 are returned in
	 *        q0, q1, q4, q6, q3, q7, q2, q5 (in that order)
	 * Clobbers q8-q15, bskey, rounds and the flags.
	 *
	 * NEON instructions leave the integer flags alone, so the result
	 * of 'subs rounds' is still valid for the 'beq' after mix_cols.
	 */
aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key
	__ldr		q8, M0SR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

	/* permutations for shift_rows: SR in inner rounds, SRM0 in the last */
	.align		5
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

.Lenc_last:
	__ldr		q12, SRM0
.Lenc_loop:
	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Lenc_sbox:
	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
								q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Lenc_done		// borrow: that was the last sbox

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
								q13, q14, q15

	beq		.Lenc_last		// one round left: use SRM0
	__ldr		q12, SR
	b		.Lenc_loop

.Lenc_done:
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q4, q4, q12
	veor		q6, q6, q12
	veor		q3, q3, q12
	veor		q7, q7, q12
	veor		q2, q2, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_encrypt8)

/* input permutation used by aesbs_decrypt8 after adding the first key */
	.align		4
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

	/*
	 * aesbs_decrypt8 - decrypt the eight blocks held in q0-q7
	 *
	 * Internal routine with a private register convention:
	 *   in:  q0-q7  = eight input blocks
	 *        bskey  = key schedule as produced by aesbs_convert_key
	 *        rounds = number of AES rounds
	 *   out: results for inputs q0..q7 are returned in
	 *        q0, q1, q6, q4, q2, q7, q3, q5 (in that order)
	 * Clobbers q8-q15, bskey, rounds and the flags.
	 *
	 * The key schedule is consumed back to front. With the layout
	 * 16 + (rounds - 1) * 128 + 16 bytes, the final 16-byte key sits at
	 * offset rounds * 128 - 112; it is applied first. bskey is then
	 * moved down by 128 to the last bit sliced key, and inv_mix_cols
	 * keeps stepping it down by 128 per round.
	 */
aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128
	__ldr		q8, M0ISR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

	/* permutations for inv_shift_rows: ISR inner rounds, ISRM0 last */
	.align		5
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

.Ldec_last:
	__ldr		q12, ISRM0
.Ldec_loop:
	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Ldec_sbox:
	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
								q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Ldec_done		// borrow: that was the last sbox

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
								q13, q14, q15

	beq		.Ldec_last		// one round left: use ISRM0
	__ldr		q12, ISR
	b		.Ldec_loop

.Ldec_done:
	add		bskey, bskey, #112	// back to the start of the schedule
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q6, q6, q12
	veor		q4, q4, q12
	veor		q2, q2, q12
	veor		q7, q7, q12
	veor		q3, q3, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_decrypt8)

/*
|
|
|
|
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
|
|
|
* int blocks)
|
|
|
|
* aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
|
|
|
* int blocks)
|
|
|
|
*/
|
|
|
|
.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
|
|
|
|
push {r4-r6, lr}
|
|
|
|
ldr r5, [sp, #16] // number of blocks
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
99: adr ip, 0f
|
2017-01-12 00:41:54 +08:00
|
|
|
and lr, r5, #7
|
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #2
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q0}, [r1]!
|
|
|
|
vld1.8 {q1}, [r1]!
|
|
|
|
vld1.8 {q2}, [r1]!
|
|
|
|
vld1.8 {q3}, [r1]!
|
|
|
|
vld1.8 {q4}, [r1]!
|
|
|
|
vld1.8 {q5}, [r1]!
|
|
|
|
vld1.8 {q6}, [r1]!
|
|
|
|
vld1.8 {q7}, [r1]!
|
|
|
|
|
|
|
|
0: mov bskey, r2
|
|
|
|
mov rounds, r3
|
|
|
|
bl \do8
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
adr ip, 1f
|
2017-01-12 00:41:54 +08:00
|
|
|
and lr, r5, #7
|
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #2
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vst1.8 {\o0}, [r0]!
|
|
|
|
vst1.8 {\o1}, [r0]!
|
|
|
|
vst1.8 {\o2}, [r0]!
|
|
|
|
vst1.8 {\o3}, [r0]!
|
|
|
|
vst1.8 {\o4}, [r0]!
|
|
|
|
vst1.8 {\o5}, [r0]!
|
|
|
|
vst1.8 {\o6}, [r0]!
|
|
|
|
vst1.8 {\o7}, [r0]!
|
|
|
|
|
|
|
|
1: subs r5, r5, #8
|
|
|
|
bgt 99b
|
|
|
|
|
|
|
|
pop {r4-r6, pc}
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * ECB encryption entry point: __ecb_crypt around aesbs_encrypt8, with the
 * output registers listed in the order in which aesbs_encrypt8 returns the
 * results for inputs q0-q7.
 */
	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

/*
 * ECB decryption entry point: __ecb_crypt around aesbs_decrypt8, with the
 * output registers listed in the order in which aesbs_decrypt8 returns the
 * results for inputs q0-q7.
 */
	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)

/*
|
|
|
|
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
|
|
|
* int rounds, int blocks, u8 iv[])
|
|
|
|
*/
|
|
|
|
.align 4
|
|
|
|
ENTRY(aesbs_cbc_decrypt)
|
|
|
|
mov ip, sp
|
|
|
|
push {r4-r6, lr}
|
|
|
|
ldm ip, {r5-r6} // load args 4-5
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
99: adr ip, 0f
|
2017-01-12 00:41:54 +08:00
|
|
|
and lr, r5, #7
|
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #2
|
|
|
|
mov lr, r1
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q0}, [lr]!
|
|
|
|
vld1.8 {q1}, [lr]!
|
|
|
|
vld1.8 {q2}, [lr]!
|
|
|
|
vld1.8 {q3}, [lr]!
|
|
|
|
vld1.8 {q4}, [lr]!
|
|
|
|
vld1.8 {q5}, [lr]!
|
|
|
|
vld1.8 {q6}, [lr]!
|
|
|
|
vld1.8 {q7}, [lr]
|
|
|
|
|
|
|
|
0: mov bskey, r2
|
|
|
|
mov rounds, r3
|
|
|
|
bl aesbs_decrypt8
|
|
|
|
|
|
|
|
vld1.8 {q8}, [r6]
|
|
|
|
vmov q9, q8
|
|
|
|
vmov q10, q8
|
|
|
|
vmov q11, q8
|
|
|
|
vmov q12, q8
|
|
|
|
vmov q13, q8
|
|
|
|
vmov q14, q8
|
|
|
|
vmov q15, q8
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
adr ip, 1f
|
2017-01-12 00:41:54 +08:00
|
|
|
and lr, r5, #7
|
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #2
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q9}, [r1]!
|
|
|
|
vld1.8 {q10}, [r1]!
|
|
|
|
vld1.8 {q11}, [r1]!
|
|
|
|
vld1.8 {q12}, [r1]!
|
|
|
|
vld1.8 {q13}, [r1]!
|
|
|
|
vld1.8 {q14}, [r1]!
|
|
|
|
vld1.8 {q15}, [r1]!
|
|
|
|
W(nop)
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
1: adr ip, 2f
|
2017-01-12 00:41:54 +08:00
|
|
|
sub ip, ip, lr, lsl #3
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
veor q0, q0, q8
|
|
|
|
vst1.8 {q0}, [r0]!
|
|
|
|
veor q1, q1, q9
|
|
|
|
vst1.8 {q1}, [r0]!
|
|
|
|
veor q6, q6, q10
|
|
|
|
vst1.8 {q6}, [r0]!
|
|
|
|
veor q4, q4, q11
|
|
|
|
vst1.8 {q4}, [r0]!
|
|
|
|
veor q2, q2, q12
|
|
|
|
vst1.8 {q2}, [r0]!
|
|
|
|
veor q7, q7, q13
|
|
|
|
vst1.8 {q7}, [r0]!
|
|
|
|
veor q3, q3, q14
|
|
|
|
vst1.8 {q3}, [r0]!
|
|
|
|
veor q5, q5, q15
|
|
|
|
vld1.8 {q8}, [r1]! // load next round's iv
|
|
|
|
2: vst1.8 {q5}, [r0]!
|
|
|
|
|
|
|
|
subs r5, r5, #8
|
|
|
|
vst1.8 {q8}, [r6] // store next round's iv
|
|
|
|
bgt 99b
|
|
|
|
|
|
|
|
pop {r4-r6, pc}
|
|
|
|
ENDPROC(aesbs_cbc_decrypt)
|
|
|
|
|
|
|
|
/*
 * next_ctr - emit the current counter value into \q and advance it
 *
 * The 128-bit counter lives in r7-r10 as four 32-bit words, r10 being the
 * least significant (the carry ripples from r10 up to r7). The words are
 * moved into \q, the counter is incremented by one, and vrev32.8 then
 * byte-reverses each 32-bit word of \q.
 *
 * The element size is spelled out (vmov.32) because GNU as 2.22 and older
 * cannot infer it for these scalar moves and fails with 'bad type for
 * scalar'.
 *
 * The macro expands to exactly nine instructions; aesbs_ctr_encrypt
 * depends on that size for its computed goto. Clobbers the flags.
 */
	.macro		next_ctr, q
	vmov.32		\q\()h[1], r10
	adds		r10, r10, #1
	vmov.32		\q\()h[0], r9
	adcs		r9, r9, #0
	vmov.32		\q\()l[1], r8
	adcs		r8, r8, #0
	vmov.32		\q\()l[0], r7
	adc		r7, r7, #0
	vrev32.8	\q, \q
	.endm

/*
|
|
|
|
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
2017-02-02 19:38:56 +08:00
|
|
|
* int rounds, int blocks, u8 ctr[], u8 final[])
|
2017-01-12 00:41:54 +08:00
|
|
|
*/
|
|
|
|
ENTRY(aesbs_ctr_encrypt)
|
|
|
|
mov ip, sp
|
|
|
|
push {r4-r10, lr}
|
|
|
|
|
|
|
|
ldm ip, {r5-r7} // load args 4-6
|
2017-02-02 19:38:56 +08:00
|
|
|
teq r7, #0
|
|
|
|
addne r5, r5, #1 // one extra block if final != 0
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q0}, [r6] // load counter
|
|
|
|
vrev32.8 q1, q0
|
|
|
|
vmov r9, r10, d3
|
|
|
|
vmov r7, r8, d2
|
|
|
|
|
|
|
|
adds r10, r10, #1
|
|
|
|
adcs r9, r9, #0
|
|
|
|
adcs r8, r8, #0
|
|
|
|
adc r7, r7, #0
|
|
|
|
|
|
|
|
99: vmov q1, q0
|
|
|
|
vmov q2, q0
|
|
|
|
vmov q3, q0
|
|
|
|
vmov q4, q0
|
|
|
|
vmov q5, q0
|
|
|
|
vmov q6, q0
|
|
|
|
vmov q7, q0
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
adr ip, 0f
|
2017-01-12 00:41:54 +08:00
|
|
|
sub lr, r5, #1
|
|
|
|
and lr, lr, #7
|
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #5
|
|
|
|
sub ip, ip, lr, lsl #2
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
next_ctr q1
|
|
|
|
next_ctr q2
|
|
|
|
next_ctr q3
|
|
|
|
next_ctr q4
|
|
|
|
next_ctr q5
|
|
|
|
next_ctr q6
|
|
|
|
next_ctr q7
|
|
|
|
|
|
|
|
0: mov bskey, r2
|
|
|
|
mov rounds, r3
|
|
|
|
bl aesbs_encrypt8
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
adr ip, 1f
|
2017-01-12 00:41:54 +08:00
|
|
|
and lr, r5, #7
|
|
|
|
cmp r5, #8
|
|
|
|
movgt r4, #0
|
|
|
|
ldrle r4, [sp, #40] // load final in the last round
|
|
|
|
sub ip, ip, lr, lsl #2
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q8}, [r1]!
|
|
|
|
vld1.8 {q9}, [r1]!
|
|
|
|
vld1.8 {q10}, [r1]!
|
|
|
|
vld1.8 {q11}, [r1]!
|
|
|
|
vld1.8 {q12}, [r1]!
|
|
|
|
vld1.8 {q13}, [r1]!
|
|
|
|
vld1.8 {q14}, [r1]!
|
|
|
|
teq r4, #0 // skip last block if 'final'
|
|
|
|
1: bne 2f
|
|
|
|
vld1.8 {q15}, [r1]!
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
2: adr ip, 3f
|
2017-01-12 00:41:54 +08:00
|
|
|
cmp r5, #8
|
|
|
|
sub ip, ip, lr, lsl #3
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
veor q0, q0, q8
|
|
|
|
vst1.8 {q0}, [r0]!
|
|
|
|
veor q1, q1, q9
|
|
|
|
vst1.8 {q1}, [r0]!
|
|
|
|
veor q4, q4, q10
|
|
|
|
vst1.8 {q4}, [r0]!
|
|
|
|
veor q6, q6, q11
|
|
|
|
vst1.8 {q6}, [r0]!
|
|
|
|
veor q3, q3, q12
|
|
|
|
vst1.8 {q3}, [r0]!
|
|
|
|
veor q7, q7, q13
|
|
|
|
vst1.8 {q7}, [r0]!
|
|
|
|
veor q2, q2, q14
|
|
|
|
vst1.8 {q2}, [r0]!
|
|
|
|
teq r4, #0 // skip last block if 'final'
|
2017-02-02 19:38:56 +08:00
|
|
|
W(bne) 5f
|
2017-01-12 00:41:54 +08:00
|
|
|
3: veor q5, q5, q15
|
|
|
|
vst1.8 {q5}, [r0]!
|
|
|
|
|
2017-02-02 19:38:56 +08:00
|
|
|
4: next_ctr q0
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
subs r5, r5, #8
|
|
|
|
bgt 99b
|
|
|
|
|
2017-02-02 19:38:56 +08:00
|
|
|
vst1.8 {q0}, [r6]
|
2017-01-12 00:41:54 +08:00
|
|
|
pop {r4-r10, pc}
|
2017-02-02 19:38:56 +08:00
|
|
|
|
|
|
|
5: vst1.8 {q5}, [r4]
|
|
|
|
b 4b
|
2017-01-12 00:41:54 +08:00
|
|
|
ENDPROC(aesbs_ctr_encrypt)
|
|
|
|
|
|
|
|
/*
 * next_tweak - derive the following tweak value
 *
 * out:   receives (in + in) per 64-bit lane, XORed with the correction
 * in:    current tweak (left unchanged when out != in)
 * const: mask vector; __xts_prepare8 builds it from 0x87 and 1
 * tmp:   scratch, clobbered
 *
 * vshr.s64 #63 turns the top bit of each 64-bit lane into an all-ones or
 * all-zeroes mask, which selects \const. vext.8 #8 swaps the two lanes so
 * that the bit shifted out of one half is applied to the other half.
 * The macro expands to exactly five instructions; __xts_prepare8 depends
 * on that size for its computed goto.
 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

/*
|
|
|
|
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
2019-09-04 00:43:36 +08:00
|
|
|
* int blocks, u8 iv[], int reorder_last_tweak)
|
2017-01-12 00:41:54 +08:00
|
|
|
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
2019-09-04 00:43:36 +08:00
|
|
|
* int blocks, u8 iv[], int reorder_last_tweak)
|
2017-01-12 00:41:54 +08:00
|
|
|
*/
|
|
|
|
__xts_prepare8:
|
|
|
|
vld1.8 {q14}, [r7] // load iv
|
2019-09-04 00:43:27 +08:00
|
|
|
vmov.i32 d30, #0x87 // compose tweak mask vector
|
|
|
|
vmovl.u32 q15, d30
|
|
|
|
vshr.u64 d30, d31, #7
|
2017-01-12 00:41:54 +08:00
|
|
|
vmov q12, q14
|
|
|
|
|
2020-09-16 20:36:40 +08:00
|
|
|
adr ip, 0f
|
2017-01-12 00:41:54 +08:00
|
|
|
and r4, r6, #7
|
|
|
|
cmp r6, #8
|
|
|
|
sub ip, ip, r4, lsl #5
|
|
|
|
mov r4, sp
|
2020-09-16 20:36:40 +08:00
|
|
|
movlt pc, ip // computed goto if blocks < 8
|
2017-01-12 00:41:54 +08:00
|
|
|
|
|
|
|
vld1.8 {q0}, [r1]!
|
|
|
|
next_tweak q12, q14, q15, q13
|
|
|
|
veor q0, q0, q14
|
|
|
|
vst1.8 {q14}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q1}, [r1]!
|
|
|
|
next_tweak q14, q12, q15, q13
|
|
|
|
veor q1, q1, q12
|
|
|
|
vst1.8 {q12}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q2}, [r1]!
|
|
|
|
next_tweak q12, q14, q15, q13
|
|
|
|
veor q2, q2, q14
|
|
|
|
vst1.8 {q14}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q3}, [r1]!
|
|
|
|
next_tweak q14, q12, q15, q13
|
|
|
|
veor q3, q3, q12
|
|
|
|
vst1.8 {q12}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q4}, [r1]!
|
|
|
|
next_tweak q12, q14, q15, q13
|
|
|
|
veor q4, q4, q14
|
|
|
|
vst1.8 {q14}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q5}, [r1]!
|
|
|
|
next_tweak q14, q12, q15, q13
|
|
|
|
veor q5, q5, q12
|
|
|
|
vst1.8 {q12}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q6}, [r1]!
|
|
|
|
next_tweak q12, q14, q15, q13
|
|
|
|
veor q6, q6, q14
|
|
|
|
vst1.8 {q14}, [r4, :128]!
|
|
|
|
|
|
|
|
vld1.8 {q7}, [r1]!
|
|
|
|
next_tweak q14, q12, q15, q13
|
2019-09-04 00:43:36 +08:00
|
|
|
THUMB( itt le )
|
|
|
|
W(cmple) r8, #0
|
|
|
|
ble 1f
|
|
|
|
0: veor q7, q7, q12
|
2017-01-12 00:41:54 +08:00
|
|
|
vst1.8 {q12}, [r4, :128]
|
|
|
|
|
2019-09-04 00:43:36 +08:00
|
|
|
vst1.8 {q14}, [r7] // store next iv
|
2017-01-12 00:41:54 +08:00
|
|
|
bx lr
|
2019-09-04 00:43:36 +08:00
|
|
|
|
|
|
|
1: vswp q12, q14
|
|
|
|
b 0b
|
2017-01-12 00:41:54 +08:00
|
|
|
ENDPROC(__xts_prepare8)
|
|
|
|
|
|
|
|
/*
 * __xts_crypt - shared body of aesbs_xts_encrypt and aesbs_xts_decrypt
 *
 * do8:   8-way routine to call (aesbs_encrypt8 or aesbs_decrypt8)
 * o0-o7: Q registers in which \do8 returns the results for inputs q0-q7
 *
 * On entry: r0: out, r1: in, r2: rk, r3: rounds,
 *           stack args: blocks (kept in r6), iv (kept in r7),
 *           ip: reorder_last_tweak flag, supplied by the entry point
 *
 * r8 is set to 1 - ip for use by __xts_prepare8. A 16-byte aligned
 * 128-byte buffer is carved out below sp to hold the eight tweaks of the
 * current iteration; the original sp is kept in r5 and restored on exit.
 *
 * When fewer than eight blocks remain, computed gotos skip the leading
 * entries of each run: label - n * 4 for the tweak loads and
 * label - n * 8 for the veor/vst1 pairs, so every instruction in those
 * runs must stay 4 bytes wide.
 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	push		{r4-r8, lr}
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	rsb		r8, ip, #1
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes
	mov		sp, ip

99:	bl		__xts_prepare8

	mov		bskey, r2
	mov		rounds, r3
	bl		\do8

	adr		ip, 0f
	and		lr, r6, #7
	cmp		r6, #8
	sub		ip, ip, lr, lsl #2
	mov		r4, sp
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

0:	adr		ip, 1f
	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	veor		\o0, \o0, q8
	vst1.8		{\o0}, [r0]!
	veor		\o1, \o1, q9
	vst1.8		{\o1}, [r0]!
	veor		\o2, \o2, q10
	vst1.8		{\o2}, [r0]!
	veor		\o3, \o3, q11
	vst1.8		{\o3}, [r0]!
	veor		\o4, \o4, q12
	vst1.8		{\o4}, [r0]!
	veor		\o5, \o5, q13
	vst1.8		{\o5}, [r0]!
	veor		\o6, \o6, q14
	vst1.8		{\o6}, [r0]!
	veor		\o7, \o7, q15
	vst1.8		{\o7}, [r0]!

1:	subs		r6, r6, #8
	bgt		99b

	mov		sp, r5
	pop		{r4-r8, pc}
	.endm

/*
 * XTS encryption entry point. The reorder_last_tweak argument on the stack
 * is ignored: ip is forced to 0 before entering the shared body, with the
 * output registers listed in the order used by aesbs_encrypt8.
 */
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

/*
 * XTS decryption entry point. The third stack argument
 * (reorder_last_tweak) is passed to the shared body in ip; it is read
 * before __xts_crypt pushes anything, hence the offset of 8 from the
 * incoming sp. Output registers are listed in the order used by
 * aesbs_decrypt8.
 */
ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)