linux/arch/arm64/crypto/sha3-ce-core.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
 *
 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Define an assembler symbol .Lv<N>.16b and .Lv<N>.2d for every vector
	 * register number N in 0..31, each evaluating to N. The instruction
	 * macros in this file paste ".L" in front of an operand such as
	 * "v7.16b" to turn it into the register number for the opcode fields.
	 * Only the .16b and .2d arrangements are covered.
	 */
	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	.set	.Lv\b\().16b, \b
	.set	.Lv\b\().2d, \b
	.endr

	/*
	 * ARMv8.2 Crypto Extensions instructions
	 */
	/*
	 * eor3 rd, rn, rm, ra - emit the EOR3 (three-way exclusive OR)
	 * instruction as a raw opcode: rd = rn ^ rm ^ ra.
	 * Fields: Rm at bit 16, Ra at bit 10, Rn at bit 5, Rd at bit 0.
	 * Operands must be spelled vN.16b or vN.2d so that ".L<operand>"
	 * resolves to the register number.
	 */
	.macro	eor3, rd, rn, rm, ra
	.inst	0xce000000 | (.L\rm << 16) | (.L\ra << 10) | (.L\rn << 5) | .L\rd
	.endm

	/*
	 * rax1 rd, rn, rm - emit the RAX1 (rotate and exclusive OR)
	 * instruction as a raw opcode: rd = rn ^ rol64(rm, 1) per 64-bit lane.
	 * Fields: Rm at bit 16, Rn at bit 5, Rd at bit 0.
	 * Operands must be spelled vN.16b or vN.2d so that ".L<operand>"
	 * resolves to the register number.
	 */
	.macro	rax1, rd, rn, rm
	.inst	0xce608c00 | (.L\rm << 16) | (.L\rn << 5) | .L\rd
	.endm

	/*
	 * bcax rd, rn, rm, ra - emit the BCAX (bit clear and exclusive OR)
	 * instruction as a raw opcode: rd = rn ^ (rm & ~ra).
	 * Fields: Rm at bit 16, Ra at bit 10, Rn at bit 5, Rd at bit 0.
	 * Operands must be spelled vN.16b or vN.2d so that ".L<operand>"
	 * resolves to the register number.
	 */
	.macro	bcax, rd, rn, rm, ra
	.inst	0xce200000 | (.L\rm << 16) | (.L\ra << 10) | (.L\rn << 5) | .L\rd
	.endm

	/*
	 * xar rd, rn, rm, imm6 - emit the XAR (exclusive OR and rotate)
	 * instruction as a raw opcode: rd = ror64(rn ^ rm, imm6) per 64-bit
	 * lane. Passing (64 - n) as imm6 therefore rotates left by n.
	 * Fields: Rm at bit 16, imm6 at bit 10, Rn at bit 5, Rd at bit 0.
	 * Register operands must be spelled vN.16b or vN.2d so that
	 * ".L<operand>" resolves to the register number.
	 */
	.macro	xar, rd, rn, rm, imm6
	.inst	0xce800000 | (.L\rm << 16) | ((\imm6) << 10) | (.L\rn << 5) | .L\rd
	.endm

	/*
	 * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
	 */
	.text
ENTRY(sha3_ce_transform)
	/*
	 * Absorb "blocks" input blocks from "data" into the 25-lane state at
	 * "st", running 24 permutation rounds after each block. The number of
	 * bytes absorbed per block is selected by bits 6, 4 and 2 of dg_size.
	 *
	 * Register use after the moves below:
	 *   x19      st, base of the 200-byte state (6 x 32 + 8 bytes)
	 *   x20      data, post-incremented as input is consumed
	 *   w21      blocks still to process
	 *   x22      dg_size (only bits 6, 4 and 2 are tested)
	 *   x8 / w8  scratch pointer for state load/store, then round counter
	 *   x9       walks the .Lsha3_rcon round-constant table
	 *   v0-v24   state lanes; only the low 64 bits of each carry state
	 *   v25-v31  input staging and per-round temporaries
	 *
	 * NOTE(review): blocks == 0 is not handled - w21 is decremented before
	 * it is tested, so the caller must pass at least one block.
	 * NOTE(review): frame_push/frame_pop are not defined in this file;
	 * presumably "frame_push 4" preserves x19-x22 - confirm against
	 * <asm/assembler.h>.
	 */
	frame_push	4

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3

0:	/* load state */
	/*
	 * Each ld1 {..1d} reads 32 bytes into four consecutive registers, one
	 * 64-bit lane per register. x19 is left untouched so it can be reused
	 * as the store base later; x8 walks from st + 32 onwards.
	 */
	add	x8, x19, #32
	ld1	{ v0.1d- v3.1d}, [x19]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

	/* per-block setup: count the block, arm 24 rounds, rewind the table */
1:	sub	w21, w21, #1
	mov	w8, #24
	adr_l	x9, .Lsha3_rcon

	/* load input */
	/* lanes 0-6 (56 bytes) are absorbed for every digest size */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v31.8b}, [x20], #24
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b
	eor	v5.8b, v5.8b, v30.8b
	eor	v6.8b, v6.8b, v31.8b

	/*
	 * The bit tests below are consistent with dg_size being the digest
	 * length in bytes (64, 48, 32 or 28) - the caller is not visible here.
	 */
	tbnz	x22, #6, 3f		// SHA3-512

	/* lanes 7-12: shared by SHA3-384, SHA3-256 and SHA3-224 */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v30.8b}, [x20], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b
	eor	 v9.8b,  v9.8b, v27.8b
	eor	v10.8b, v10.8b, v28.8b
	eor	v11.8b, v11.8b, v29.8b
	eor	v12.8b, v12.8b, v30.8b

	tbnz	x22, #4, 2f		// SHA3-384 or SHA3-224

	// SHA3-256: lanes 13-16, 17 lanes (136 bytes) absorbed in total
	ld1	{v25.8b-v28.8b}, [x20], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	b	4f

	/* SHA3-384 stops here with 13 lanes (104 bytes) absorbed */
2:	tbz	x22, #2, 4f		// bit 2 clear: SHA3-384

	// SHA3-224: lanes 13-17, 18 lanes (144 bytes) absorbed in total
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b}, [x20], #8
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	eor	v17.8b, v17.8b, v29.8b
	b	4f

	// SHA3-512: lanes 7-8, 9 lanes (72 bytes) absorbed in total
3:	ld1	{v25.8b-v26.8b}, [x20], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b

	/*
	 * One permutation round per pass through label 4. Registers are
	 * overwritten in place throughout, so the instruction order within
	 * each group below is significant.
	 */
4:	sub	w8, w8, #1

	/*
	 * Column parities, each built from two eor3 (five lanes XORed):
	 *   v25 = lanes 0,5,10,15,20    v26 = lanes 1,6,11,16,21
	 *   v27 = lanes 2,7,12,17,22    v28 = lanes 3,8,13,18,23
	 *   v29 = lanes 4,9,14,19,24
	 */
	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	/*
	 * bc[x] = parity[x - 1] ^ rol(parity[x + 1], 1), indices mod 5.
	 * Each parity register is overwritten only after its last use as a
	 * rotated input; v29 is read once more by the final rax1.
	 */
	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
	rax1	v27.2d, v27.2d, v29.2d	// bc[3]

	/*
	 * XOR each lane with the bc[] value of its column and rotate it left
	 * by the amount written as (64 - n), since xar rotates right. Lane 0
	 * is not rotated, so a plain eor suffices. Results land either
	 * directly in a state register or in a temporary (v25-v31) that the
	 * bcax groups below consume.
	 */
	eor	 v0.16b,  v0.16b, v30.16b
	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)

	/*
	 * Non-linear step, one group of five bcax per row of the state:
	 * out = a ^ (b & ~c). This group produces lanes 20-24.
	 */
	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b,  v8.16b, v31.16b

	/*
	 * v31 was last read by the bcax just above, so it is free to receive
	 * this round's constant (replicated into both 64-bit halves); x9
	 * advances to the next table entry.
	 */
	ld1r	{v31.2d}, [x9], #8

	/* lanes 15-19 */
	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b,  v3.16b, v25.16b

	/* lanes 10-14 */
	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	/* lanes 5-9 */
	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b

	/* lanes 0-4 */
	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b

	/* fold the round constant into lane 0 */
	eor	 v0.16b,  v0.16b, v31.16b

	cbnz	w8, 4b			// more rounds left for this block
	cbz	w21, 5f			// no blocks left: save state and return

	/*
	 * NOTE(review): if_will_cond_yield_neon, do_cond_yield_neon and
	 * endif_yield_neon are not defined in this file (presumably they come
	 * from <asm/assembler.h>). The instructions between them write the
	 * whole state back to st and then branch to 0b, which reloads it, so
	 * the vector registers are evidently not expected to survive a yield.
	 * When no yield is taken, control reaches "b 1b" with the state still
	 * live in v0-v24.
	 */
	if_will_cond_yield_neon
	add	x8, x19, #32
	st1	{ v0.1d- v3.1d}, [x19]
	st1	{ v4.1d- v7.1d}, [x8], #32
	st1	{ v8.1d-v11.1d}, [x8], #32
	st1	{v12.1d-v15.1d}, [x8], #32
	st1	{v16.1d-v19.1d}, [x8], #32
	st1	{v20.1d-v23.1d}, [x8], #32
	st1	{v24.1d}, [x8]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b	1b

	/* save state */
	/*
	 * Final write-back of all 25 lanes. x19 itself is post-incremented
	 * here, which is fine because st is not needed again before return.
	 */
5:	st1	{ v0.1d- v3.1d}, [x19], #32
	st1	{ v4.1d- v7.1d}, [x19], #32
	st1	{ v8.1d-v11.1d}, [x19], #32
	st1	{v12.1d-v15.1d}, [x19], #32
	st1	{v16.1d-v19.1d}, [x19], #32
	st1	{v20.1d-v23.1d}, [x19], #32
	st1	{v24.1d}, [x19]
	frame_pop
	ret
ENDPROC(sha3_ce_transform)

	/*
	 * Round constants: 24 64-bit values, one per round, read in order by
	 * the ld1r in the round loop of sha3_ce_transform (x9 is reset to the
	 * start of the table for every block).
	 *
	 * NOTE(review): on AArch64 GNU as, .align is believed to take a
	 * power-of-two exponent, so ".align 8" would request 256-byte
	 * alignment rather than 8-byte alignment - confirm this is intended.
	 */
	.section	".rodata", "a"
	.align		8
.Lsha3_rcon:
	.quad	0x0000000000000001, 0x0000000000008082	// rounds  0,  1
	.quad	0x800000000000808a, 0x8000000080008000	// rounds  2,  3
	.quad	0x000000000000808b, 0x0000000080000001	// rounds  4,  5
	.quad	0x8000000080008081, 0x8000000000008009	// rounds  6,  7
	.quad	0x000000000000008a, 0x0000000000000088	// rounds  8,  9
	.quad	0x0000000080008009, 0x000000008000000a	// rounds 10, 11
	.quad	0x000000008000808b, 0x800000000000008b	// rounds 12, 13
	.quad	0x8000000000008089, 0x8000000000008003	// rounds 14, 15
	.quad	0x8000000000008002, 0x8000000000000080	// rounds 16, 17
	.quad	0x000000000000800a, 0x800000008000000a	// rounds 18, 19
	.quad	0x8000000080008081, 0x8000000000008080	// rounds 20, 21
	.quad	0x0000000080000001, 0x8000000080008008	// rounds 22, 23
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`/* SPDX-License-Identifier: GPL-2.0 */`
			`/*`
			`* sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions`
			`*`
			`* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License version 2 as`
			`* published by the Free Software Foundation.`
			`*/`

			`#include <linux/linkage.h>`
			`#include <asm/assembler.h>`

			`.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31`
			`.set .Lv\b\().2d, \b`
			`.set .Lv\b\().16b, \b`
			`.endr`

			`/*`
			`* ARMv8.2 Crypto Extensions instructions`
			`*/`
			`.macro eor3, rd, rn, rm, ra`
			`.inst 0xce000000 \| .L\rd \| (.L\rn << 5) \| (.L\ra << 10) \| (.L\rm << 16)`
			`.endm`

			`.macro rax1, rd, rn, rm`
			`.inst 0xce608c00 \| .L\rd \| (.L\rn << 5) \| (.L\rm << 16)`
			`.endm`

			`.macro bcax, rd, rn, rm, ra`
			`.inst 0xce200000 \| .L\rd \| (.L\rn << 5) \| (.L\ra << 10) \| (.L\rm << 16)`
			`.endm`

			`.macro xar, rd, rn, rm, imm6`
			`.inst 0xce800000 \| .L\rd \| (.L\rn << 5) \| ((\imm6) << 10) \| (.L\rm << 16)`
			`.endm`

			`/*`
			`* sha3_ce_transform(u64 st, const u8 data, int blocks, int dg_size)`
			`*/`
			`.text`
			`ENTRY(sha3_ce_transform)`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`frame_push 4`

			`mov x19, x0`
			`mov x20, x1`
			`mov x21, x2`
			`mov x22, x3`

			`0: /* load state */`
			`add x8, x19, #32`
			`ld1 { v0.1d- v3.1d}, [x19]`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`ld1 { v4.1d- v7.1d}, [x8], #32`
			`ld1 { v8.1d-v11.1d}, [x8], #32`
			`ld1 {v12.1d-v15.1d}, [x8], #32`
			`ld1 {v16.1d-v19.1d}, [x8], #32`
			`ld1 {v20.1d-v23.1d}, [x8], #32`
			`ld1 {v24.1d}, [x8]`

crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`1: sub w21, w21, #1`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`mov w8, #24`
			`adr_l x9, .Lsha3_rcon`

			`/* load input */`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`ld1 {v25.8b-v28.8b}, [x20], #32`
			`ld1 {v29.8b-v31.8b}, [x20], #24`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`eor v0.8b, v0.8b, v25.8b`
			`eor v1.8b, v1.8b, v26.8b`
			`eor v2.8b, v2.8b, v27.8b`
			`eor v3.8b, v3.8b, v28.8b`
			`eor v4.8b, v4.8b, v29.8b`
			`eor v5.8b, v5.8b, v30.8b`
			`eor v6.8b, v6.8b, v31.8b`

crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`tbnz x22, #6, 3f // SHA3-512`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`ld1 {v25.8b-v28.8b}, [x20], #32`
			`ld1 {v29.8b-v30.8b}, [x20], #16`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`eor v7.8b, v7.8b, v25.8b`
			`eor v8.8b, v8.8b, v26.8b`
			`eor v9.8b, v9.8b, v27.8b`
			`eor v10.8b, v10.8b, v28.8b`
			`eor v11.8b, v11.8b, v29.8b`
			`eor v12.8b, v12.8b, v30.8b`

crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`tbnz x22, #4, 2f // SHA3-384 or SHA3-224`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
			`// SHA3-256`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`ld1 {v25.8b-v28.8b}, [x20], #32`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`eor v13.8b, v13.8b, v25.8b`
			`eor v14.8b, v14.8b, v26.8b`
			`eor v15.8b, v15.8b, v27.8b`
			`eor v16.8b, v16.8b, v28.8b`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`b 4f`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`2: tbz x22, #2, 4f // bit 2 cleared? SHA-384`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
			`// SHA3-224`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`ld1 {v25.8b-v28.8b}, [x20], #32`
			`ld1 {v29.8b}, [x20], #8`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`eor v13.8b, v13.8b, v25.8b`
			`eor v14.8b, v14.8b, v26.8b`
			`eor v15.8b, v15.8b, v27.8b`
			`eor v16.8b, v16.8b, v28.8b`
			`eor v17.8b, v17.8b, v29.8b`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`b 4f`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
			`// SHA3-512`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`3: ld1 {v25.8b-v26.8b}, [x20], #16`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`eor v7.8b, v7.8b, v25.8b`
			`eor v8.8b, v8.8b, v26.8b`

crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`4: sub w8, w8, #1`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
			`eor3 v29.16b, v4.16b, v9.16b, v14.16b`
			`eor3 v26.16b, v1.16b, v6.16b, v11.16b`
			`eor3 v28.16b, v3.16b, v8.16b, v13.16b`
			`eor3 v25.16b, v0.16b, v5.16b, v10.16b`
			`eor3 v27.16b, v2.16b, v7.16b, v12.16b`
			`eor3 v29.16b, v29.16b, v19.16b, v24.16b`
			`eor3 v26.16b, v26.16b, v16.16b, v21.16b`
			`eor3 v28.16b, v28.16b, v18.16b, v23.16b`
			`eor3 v25.16b, v25.16b, v15.16b, v20.16b`
			`eor3 v27.16b, v27.16b, v17.16b, v22.16b`

			`rax1 v30.2d, v29.2d, v26.2d // bc[0]`
			`rax1 v26.2d, v26.2d, v28.2d // bc[2]`
			`rax1 v28.2d, v28.2d, v25.2d // bc[4]`
			`rax1 v25.2d, v25.2d, v27.2d // bc[1]`
			`rax1 v27.2d, v27.2d, v29.2d // bc[3]`

			`eor v0.16b, v0.16b, v30.16b`
			`xar v29.2d, v1.2d, v25.2d, (64 - 1)`
			`xar v1.2d, v6.2d, v25.2d, (64 - 44)`
			`xar v6.2d, v9.2d, v28.2d, (64 - 20)`
			`xar v9.2d, v22.2d, v26.2d, (64 - 61)`
			`xar v22.2d, v14.2d, v28.2d, (64 - 39)`
			`xar v14.2d, v20.2d, v30.2d, (64 - 18)`
			`xar v31.2d, v2.2d, v26.2d, (64 - 62)`
			`xar v2.2d, v12.2d, v26.2d, (64 - 43)`
			`xar v12.2d, v13.2d, v27.2d, (64 - 25)`
			`xar v13.2d, v19.2d, v28.2d, (64 - 8)`
			`xar v19.2d, v23.2d, v27.2d, (64 - 56)`
			`xar v23.2d, v15.2d, v30.2d, (64 - 41)`
			`xar v15.2d, v4.2d, v28.2d, (64 - 27)`
			`xar v28.2d, v24.2d, v28.2d, (64 - 14)`
			`xar v24.2d, v21.2d, v25.2d, (64 - 2)`
			`xar v8.2d, v8.2d, v27.2d, (64 - 55)`
			`xar v4.2d, v16.2d, v25.2d, (64 - 45)`
			`xar v16.2d, v5.2d, v30.2d, (64 - 36)`
			`xar v5.2d, v3.2d, v27.2d, (64 - 28)`
			`xar v27.2d, v18.2d, v27.2d, (64 - 21)`
			`xar v3.2d, v17.2d, v26.2d, (64 - 15)`
			`xar v25.2d, v11.2d, v25.2d, (64 - 10)`
			`xar v26.2d, v7.2d, v26.2d, (64 - 6)`
			`xar v30.2d, v10.2d, v30.2d, (64 - 3)`

			`bcax v20.16b, v31.16b, v22.16b, v8.16b`
			`bcax v21.16b, v8.16b, v23.16b, v22.16b`
			`bcax v22.16b, v22.16b, v24.16b, v23.16b`
			`bcax v23.16b, v23.16b, v31.16b, v24.16b`
			`bcax v24.16b, v24.16b, v8.16b, v31.16b`

			`ld1r {v31.2d}, [x9], #8`

			`bcax v17.16b, v25.16b, v19.16b, v3.16b`
			`bcax v18.16b, v3.16b, v15.16b, v19.16b`
			`bcax v19.16b, v19.16b, v16.16b, v15.16b`
			`bcax v15.16b, v15.16b, v25.16b, v16.16b`
			`bcax v16.16b, v16.16b, v3.16b, v25.16b`

			`bcax v10.16b, v29.16b, v12.16b, v26.16b`
			`bcax v11.16b, v26.16b, v13.16b, v12.16b`
			`bcax v12.16b, v12.16b, v14.16b, v13.16b`
			`bcax v13.16b, v13.16b, v29.16b, v14.16b`
			`bcax v14.16b, v14.16b, v26.16b, v29.16b`

			`bcax v7.16b, v30.16b, v9.16b, v4.16b`
			`bcax v8.16b, v4.16b, v5.16b, v9.16b`
			`bcax v9.16b, v9.16b, v6.16b, v5.16b`
			`bcax v5.16b, v5.16b, v30.16b, v6.16b`
			`bcax v6.16b, v6.16b, v4.16b, v30.16b`

			`bcax v3.16b, v27.16b, v0.16b, v28.16b`
			`bcax v4.16b, v28.16b, v1.16b, v0.16b`
			`bcax v0.16b, v0.16b, v2.16b, v1.16b`
			`bcax v1.16b, v1.16b, v27.16b, v2.16b`
			`bcax v2.16b, v2.16b, v28.16b, v27.16b`

			`eor v0.16b, v0.16b, v31.16b`

crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`cbnz w8, 4b`
			`cbz w21, 5f`

			`if_will_cond_yield_neon`
			`add x8, x19, #32`
			`st1 { v0.1d- v3.1d}, [x19]`
			`st1 { v4.1d- v7.1d}, [x8], #32`
			`st1 { v8.1d-v11.1d}, [x8], #32`
			`st1 {v12.1d-v15.1d}, [x8], #32`
			`st1 {v16.1d-v19.1d}, [x8], #32`
			`st1 {v20.1d-v23.1d}, [x8], #32`
			`st1 {v24.1d}, [x8]`
			`do_cond_yield_neon`
			`b 0b`
			`endif_yield_neon`

			`b 1b`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00
			`/* save state */`
crypto: arm64/sha3-ce - yield NEON after every block of input Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-01 00:18:29 +08:00			`5: st1 { v0.1d- v3.1d}, [x19], #32`
			`st1 { v4.1d- v7.1d}, [x19], #32`
			`st1 { v8.1d-v11.1d}, [x19], #32`
			`st1 {v12.1d-v15.1d}, [x19], #32`
			`st1 {v16.1d-v19.1d}, [x19], #32`
			`st1 {v20.1d-v23.1d}, [x19], #32`
			`st1 {v24.1d}, [x19]`
			`frame_pop`
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. Tested-by: Steve Capper <steve.capper@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2018-01-19 20:04:38 +08:00			`ret`
			`ENDPROC(sha3_ce_transform)`

			`.section ".rodata", "a"`
			`.align 8`
			`.Lsha3_rcon:`
			`.quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a`
			`.quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001`
			`.quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a`
			`.quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a`
			`.quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089`
			`.quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080`
			`.quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081`
			`.quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008`