
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git

# Conflicts:
#	arch/riscv/include/asm/bitops.h
#	drivers/perf/Kconfig
commit e635cae984
Stephen Rothwell, 2024-03-22 09:48:48 +11:00
105 changed files with 5183 additions and 747 deletions


@ -144,14 +144,8 @@ passing 0 into the hint address parameter of mmap. On CPUs with an address space
smaller than sv48, the CPU maximum supported address space will be the default.
Software can "opt-in" to receiving VAs from another VA space by providing
a hint address to mmap. A hint address passed to mmap will cause the largest
address space that fits entirely into the hint to be used, unless there is no
space left in the address space. If there is no space available in the requested
address space, an address in the next smallest available address space will be
returned.
For example, in order to obtain 48-bit VA space, a hint address greater than
:code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace
ending at :code:`1 << 47` and the addresses beyond this are reserved for the
kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater
than or equal to :code:`1 << 56` must be provided.
a hint address to mmap. When a hint address is passed to mmap, the returned
address will never use more bits than the hint address. For example, if a hint
address of `1 << 40` is passed to mmap, a valid returned address will never use
bits 41 through 63. If no mappable addresses are available in that range, mmap
will return `MAP_FAILED`.
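
To illustrate the behaviour documented above, here is a small userspace sketch (not part of this diff): it passes a hint address of 1 << 38 to mmap(), so the returned address will never use bits 39 through 63, and mmap() fails with MAP_FAILED if no mapping fits in that range.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/*
	 * Hint of 1 << 38: the kernel will not return an address that uses
	 * bits 39..63. If nothing is available in that range, mmap()
	 * returns MAP_FAILED instead of falling back to a wider VA space.
	 */
	void *hint = (void *)(1UL << 38);
	void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		perror("mmap");
	else
		printf("mapped at %p\n", p);
	return 0;
}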


@ -110,7 +110,11 @@ properties:
const: 1
compatible:
const: riscv,cpu-intc
oneOf:
  - items:
      - const: andestech,cpu-intc
      - const: riscv,cpu-intc
  - const: riscv,cpu-intc
interrupt-controller: true


@ -477,5 +477,12 @@ properties:
latency, as ratified in commit 56ed795 ("Update
riscv-crypto-spec-vector.adoc") of riscv-crypto.
- const: xandespmu
description:
The Andes Technology performance monitor extension for counter overflow
and privilege mode filtering. For more details, see Counter Related
Registers in the AX45MP datasheet.
https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf
additionalProperties: true
...


@ -10,6 +10,22 @@
# Rely on implicit context synchronization as a result of exception return
# when returning from IPI handler, and when returning to user-space.
#
# * riscv
#
# riscv uses xRET as return from interrupt and to return to user-space.
#
# Given that xRET is not core serializing, we rely on FENCE.I for providing
# core serialization:
#
# - by calling sync_core_before_usermode() on return from interrupt (cf.
# ipi_sync_core()),
#
# - via switch_mm() and sync_core_before_usermode() (respectively, for
# uthread->uthread and kthread->uthread transitions) before returning
# to user-space.
#
# The serialization in switch_mm() is activated by prepare_sync_core_cmd().
#
# * x86
#
# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
@ -43,7 +59,7 @@
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
| riscv: | TODO |
| riscv: | ok |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
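
As a hedged illustration of the FENCE.I-based scheme described in the riscv entry above (a sketch only, not the in-tree arch/riscv header, which may differ in detail), core serialization before returning to user-space can be provided roughly like this:

/*
 * Illustrative sketch: xRET is not core serializing on RISC-V, so a
 * FENCE.I on the local hart serializes the instruction stream before
 * the return to user-space.
 */
static inline void sync_core_before_usermode(void)
{
	asm volatile ("fence.i" ::: "memory");
}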


@ -7,6 +7,7 @@ Scheduler
completion
membarrier
sched-arch
sched-bwc
sched-deadline


@ -0,0 +1,39 @@
.. SPDX-License-Identifier: GPL-2.0
========================
membarrier() System Call
========================
MEMBARRIER_CMD_{PRIVATE,GLOBAL}_EXPEDITED - Architecture requirements
=====================================================================
Memory barriers before updating rq->curr
----------------------------------------
The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
require each architecture to have a full memory barrier after coming from
user-space, before updating rq->curr. This barrier is implied by the sequence
rq_lock(); smp_mb__after_spinlock() in __schedule(). The barrier matches a full
barrier in the proximity of the membarrier system call exit, cf.
membarrier_{private,global}_expedited().
Memory barriers after updating rq->curr
---------------------------------------
The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
require each architecture to have a full memory barrier after updating rq->curr,
before returning to user-space. The schemes providing this barrier on the various
architectures are as follows.
- alpha, arc, arm, hexagon, mips rely on the full barrier implied by
spin_unlock() in finish_lock_switch().
- arm64 relies on the full barrier implied by switch_to().
- powerpc, riscv, s390, sparc, x86 rely on the full barrier implied by
switch_mm(), if mm is not NULL; they rely on the full barrier implied
by mmdrop(), otherwise. On powerpc and riscv, switch_mm() relies on
membarrier_arch_switch_mm().
The barrier matches a full barrier in the proximity of the membarrier system call
entry, cf. membarrier_{private,global}_expedited().
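
The document above covers kernel-side ordering; for reference, a userspace sketch (not part of this diff) showing how the commands it names are issued through the membarrier() system call. Registration is required before the private expedited command can be used:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* A process must register before using MEMBARRIER_CMD_PRIVATE_EXPEDITED. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
		return 1;
	/* Issue a memory barrier on every running thread of this process. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0))
		return 1;
	return 0;
}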


@ -14132,7 +14132,9 @@ M: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
M: "Paul E. McKenney" <paulmck@kernel.org>
L: linux-kernel@vger.kernel.org
S: Supported
F: arch/powerpc/include/asm/membarrier.h
F: Documentation/scheduler/membarrier.rst
F: arch/*/include/asm/membarrier.h
F: arch/*/include/asm/sync_core.h
F: include/uapi/linux/membarrier.h
F: kernel/sched/membarrier.c


@ -2,6 +2,7 @@
obj-y += kernel/ mm/ net/
obj-$(CONFIG_BUILTIN_DTB) += boot/dts/
obj-$(CONFIG_CRYPTO) += crypto/
obj-y += errata/
obj-$(CONFIG_KVM) += kvm/


@ -27,14 +27,18 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MMIOWB
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN
@ -47,6 +51,9 @@ config RISCV
select ARCH_SUPPORTS_CFI_CLANG
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU
select ARCH_SUPPORTS_HUGETLBFS if MMU
# LLD >= 14: https://github.com/llvm/llvm-project/issues/50505
select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000
select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU
select ARCH_SUPPORTS_PER_VMA_LOCK if MMU
select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK
@ -106,6 +113,7 @@ config RISCV
select HAVE_ARCH_KGDB_QXFER_PKT
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
@ -124,6 +132,7 @@ config RISCV
select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
select HAVE_EBPF_JIT if MMU
select HAVE_FAST_GUP if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_GCC_PLUGINS
@ -155,6 +164,7 @@ config RISCV
select IRQ_FORCED_THREADING
select KASAN_VMALLOC if KASAN
select LOCK_MM_AND_FIND_VMA
select MMU_GATHER_RCU_TABLE_FREE if SMP && MMU
select MODULES_USE_ELF_RELA if MODULES
select MODULE_SECTIONS if MODULES
select OF
@ -576,6 +586,13 @@ config TOOLCHAIN_HAS_ZBB
depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
depends on AS_HAS_OPTION_ARCH
# This symbol indicates that the toolchain supports all v1.0 vector crypto
# extensions, including Zvk*, Zvbb, and Zvbc. LLVM added all of these at once.
# binutils added all except Zvkb, then added Zvkb. So we just check for Zvkb.
config TOOLCHAIN_HAS_VECTOR_CRYPTO
def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb)
depends on AS_HAS_OPTION_ARCH
config RISCV_ISA_ZBB
bool "Zbb extension support for bit manipulation instructions"
depends on TOOLCHAIN_HAS_ZBB
@ -686,27 +703,61 @@ config THREAD_SIZE_ORDER
affects irq stack size, which is equal to thread stack size.
config RISCV_MISALIGNED
bool "Support misaligned load/store traps for kernel and userspace"
bool
select SYSCTL_ARCH_UNALIGN_ALLOW
default y
help
Say Y here if you want the kernel to embed support for misaligned
load/store for both kernel and userspace. When disable, misaligned
accesses will generate SIGBUS in userspace and panic in kernel.
Embed support for emulating misaligned loads and stores.
choice
prompt "Unaligned Accesses Support"
default RISCV_PROBE_UNALIGNED_ACCESS
help
This determines the level of support for unaligned accesses. This
information is used by the kernel to perform optimizations. It is also
exposed to user space via the hwprobe syscall. The hardware will be
probed at boot by default.
config RISCV_PROBE_UNALIGNED_ACCESS
bool "Probe for hardware unaligned access support"
select RISCV_MISALIGNED
help
During boot, the kernel will run a series of tests to determine the
speed of unaligned accesses. This probing will dynamically determine
the speed of unaligned accesses on the underlying system. If unaligned
memory accesses trap into the kernel as they are not supported by the
system, the kernel will emulate the unaligned accesses to preserve the
UABI.
config RISCV_EMULATED_UNALIGNED_ACCESS
bool "Emulate unaligned access where system support is missing"
select RISCV_MISALIGNED
help
If unaligned memory accesses trap into the kernel as they are not
supported by the system, the kernel will emulate the unaligned
accesses to preserve the UABI. When the underlying system does support
unaligned accesses, the unaligned accesses are assumed to be slow.
config RISCV_SLOW_UNALIGNED_ACCESS
bool "Assume the system supports slow unaligned memory accesses"
depends on NONPORTABLE
help
Assume that the system supports slow unaligned memory accesses. The
kernel and userspace programs may not be able to run at all on systems
that do not support unaligned memory accesses.
config RISCV_EFFICIENT_UNALIGNED_ACCESS
bool "Assume the CPU supports fast unaligned memory accesses"
bool "Assume the system supports fast unaligned memory accesses"
depends on NONPORTABLE
select DCACHE_WORD_ACCESS if MMU
select HAVE_EFFICIENT_UNALIGNED_ACCESS
help
Say Y here if you want the kernel to assume that the CPU supports
efficient unaligned memory accesses. When enabled, this option
improves the performance of the kernel on such CPUs. However, the
kernel will run much more slowly, or will not be able to run at all,
on CPUs that do not support efficient unaligned memory accesses.
Assume that the system supports fast unaligned memory accesses. When
enabled, this option improves the performance of the kernel on such
systems. However, the kernel and userspace programs will run much more
slowly, or will not be able to run at all, on systems that do not
support efficient unaligned memory accesses.
If unsure what to do here, say N.
endchoice
endmenu # "Platform type"
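
The probed result described in the choice help text above is exposed to user space via the hwprobe syscall. A minimal sketch (assuming the riscv uapi headers providing __NR_riscv_hwprobe, struct riscv_hwprobe and the RISCV_HWPROBE_MISALIGNED_* values are available; not part of this diff):

#include <asm/hwprobe.h>
#include <asm/unistd.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	/* Query all harts (empty cpu set) for their misaligned-access speed. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	switch (pair.value & RISCV_HWPROBE_MISALIGNED_MASK) {
	case RISCV_HWPROBE_MISALIGNED_FAST:
		puts("unaligned accesses: fast");
		break;
	case RISCV_HWPROBE_MISALIGNED_EMULATED:
		puts("unaligned accesses: emulated");
		break;
	case RISCV_HWPROBE_MISALIGNED_SLOW:
		puts("unaligned accesses: slow");
		break;
	default:
		puts("unaligned accesses: unknown/unsupported");
	}
	return 0;
}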
@ -1011,11 +1062,8 @@ menu "Power management options"
source "kernel/power/Kconfig"
# Hibernation is only possible on systems where the SBI implementation has
# marked its reserved memory as not accessible from, or does not run
# from the same memory as, Linux
config ARCH_HIBERNATION_POSSIBLE
def_bool NONPORTABLE
def_bool y
config ARCH_HIBERNATION_HEADER
def_bool HIBERNATION


@ -50,6 +50,11 @@ ifndef CONFIG_AS_IS_LLVM
KBUILD_CFLAGS += -Wa,-mno-relax
KBUILD_AFLAGS += -Wa,-mno-relax
endif
# LLVM has an issue with target-features and LTO: https://github.com/llvm/llvm-project/issues/59350
# Ensure it is aware of linker relaxation with LTO, otherwise relocations may
# be incorrect: https://github.com/llvm/llvm-project/issues/65090
else ifeq ($(CONFIG_LTO_CLANG),y)
KBUILD_LDFLAGS += -mllvm -mattr=+c -mllvm -mattr=+relax
endif
ifeq ($(CONFIG_SHADOW_CALL_STACK),y)


@ -27,7 +27,7 @@
riscv,isa-base = "rv64i";
riscv,isa-extensions = "i", "m", "a", "f", "d", "c",
"zicntr", "zicsr", "zifencei",
"zihpm";
"zihpm", "xandespmu";
mmu-type = "riscv,sv39";
i-cache-size = <0x8000>;
i-cache-line-size = <0x40>;
@ -39,7 +39,7 @@
cpu0_intc: interrupt-controller {
#interrupt-cells = <1>;
compatible = "riscv,cpu-intc";
compatible = "andestech,cpu-intc", "riscv,cpu-intc";
interrupt-controller;
};
};


@ -44,6 +44,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
CONFIG_CPUFREQ_DT=y
CONFIG_ACPI_CPPC_CPUFREQ=m
CONFIG_VIRTUALIZATION=y
CONFIG_KVM=m
CONFIG_ACPI=y
@ -215,6 +216,7 @@ CONFIG_MMC=y
CONFIG_MMC_SDHCI=y
CONFIG_MMC_SDHCI_PLTFM=y
CONFIG_MMC_SDHCI_CADENCE=y
CONFIG_MMC_SDHCI_OF_DWCMSHC=y
CONFIG_MMC_SPI=y
CONFIG_MMC_DW=y
CONFIG_MMC_DW_STARFIVE=y
@ -224,6 +226,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_SUN6I=y
CONFIG_DMADEVICES=y
CONFIG_DMA_SUN6I=m
CONFIG_DW_AXI_DMAC=y
CONFIG_RZ_DMAC=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_BALLOON=y

arch/riscv/crypto/Kconfig (new file, 93 lines)

@ -0,0 +1,93 @@
# SPDX-License-Identifier: GPL-2.0
menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
config CRYPTO_AES_RISCV64
tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
select CRYPTO_SKCIPHER
help
Block cipher: AES cipher algorithms
Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTS
Architecture: riscv64 using:
- Zvkned vector crypto extension
- Zvbb vector extension (XTS)
- Zvkb vector crypto extension (CTR)
- Zvkg vector crypto extension (XTS)
config CRYPTO_CHACHA_RISCV64
tristate "Ciphers: ChaCha"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_SKCIPHER
select CRYPTO_LIB_CHACHA_GENERIC
help
Length-preserving ciphers: ChaCha20 stream cipher algorithm
Architecture: riscv64 using:
- Zvkb vector crypto extension
config CRYPTO_GHASH_RISCV64
tristate "Hash functions: GHASH"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_GCM
help
GCM GHASH function (NIST SP 800-38D)
Architecture: riscv64 using:
- Zvkg vector crypto extension
config CRYPTO_SHA256_RISCV64
tristate "Hash functions: SHA-224 and SHA-256"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_SHA256
help
SHA-224 and SHA-256 secure hash algorithm (FIPS 180)
Architecture: riscv64 using:
- Zvknha or Zvknhb vector crypto extensions
- Zvkb vector crypto extension
config CRYPTO_SHA512_RISCV64
tristate "Hash functions: SHA-384 and SHA-512"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_SHA512
help
SHA-384 and SHA-512 secure hash algorithm (FIPS 180)
Architecture: riscv64 using:
- Zvknhb vector crypto extension
- Zvkb vector crypto extension
config CRYPTO_SM3_RISCV64
tristate "Hash functions: SM3 (ShangMi 3)"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_HASH
select CRYPTO_SM3
help
SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
Architecture: riscv64 using:
- Zvksh vector crypto extension
- Zvkb vector crypto extension
config CRYPTO_SM4_RISCV64
tristate "Ciphers: SM4 (ShangMi 4)"
depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
select CRYPTO_ALGAPI
select CRYPTO_SM4
help
SM4 block cipher algorithm (OSCCA GB/T 32907-2016,
ISO/IEC 18033-3:2010/Amd 1:2021)
SM4 (GBT.32907-2016) is a cryptographic standard issued by the
Organization of State Commercial Administration of China (OSCCA)
as an authorized cryptographic algorithm for use within China.
Architecture: riscv64 using:
- Zvksed vector crypto extension
- Zvkb vector crypto extension
endmenu


@ -0,0 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_CRYPTO_AES_RISCV64) += aes-riscv64.o
aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \
aes-riscv64-zvkned-zvbb-zvkg.o aes-riscv64-zvkned-zvkb.o
obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o
sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o
obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o
sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o
obj-$(CONFIG_CRYPTO_SM4_RISCV64) += sm4-riscv64.o
sm4-riscv64-y := sm4-riscv64-glue.o sm4-riscv64-zvksed-zvkb.o


@ -0,0 +1,156 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// This file contains macros that are shared by the other aes-*.S files. The
// generated code of these macros depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')
// Loads the AES round keys from \keyp into vector registers and jumps to code
// specific to the length of the key. Specifically:
// - If AES-128, loads round keys into v1-v11 and jumps to \label128.
// - If AES-192, loads round keys into v1-v13 and jumps to \label192.
// - If AES-256, loads round keys into v1-v15 and continues onwards.
//
// Also sets vl=4 and vtype=e32,m1,ta,ma. Clobbers t0 and t1.
.macro aes_begin keyp, label128, label192
lwu t0, 480(\keyp) // t0 = key length in bytes
li t1, 24 // t1 = key length for AES-192
vsetivli zero, 4, e32, m1, ta, ma
vle32.v v1, (\keyp)
addi \keyp, \keyp, 16
vle32.v v2, (\keyp)
addi \keyp, \keyp, 16
vle32.v v3, (\keyp)
addi \keyp, \keyp, 16
vle32.v v4, (\keyp)
addi \keyp, \keyp, 16
vle32.v v5, (\keyp)
addi \keyp, \keyp, 16
vle32.v v6, (\keyp)
addi \keyp, \keyp, 16
vle32.v v7, (\keyp)
addi \keyp, \keyp, 16
vle32.v v8, (\keyp)
addi \keyp, \keyp, 16
vle32.v v9, (\keyp)
addi \keyp, \keyp, 16
vle32.v v10, (\keyp)
addi \keyp, \keyp, 16
vle32.v v11, (\keyp)
blt t0, t1, \label128 // If AES-128, goto label128.
addi \keyp, \keyp, 16
vle32.v v12, (\keyp)
addi \keyp, \keyp, 16
vle32.v v13, (\keyp)
beq t0, t1, \label192 // If AES-192, goto label192.
// Else, it's AES-256.
addi \keyp, \keyp, 16
vle32.v v14, (\keyp)
addi \keyp, \keyp, 16
vle32.v v15, (\keyp)
.endm
// Encrypts \data using zvkned instructions, using the round keys loaded into
// v1-v11 (for AES-128), v1-v13 (for AES-192), or v1-v15 (for AES-256). \keylen
// is the AES key length in bits. vl and vtype must already be set
// appropriately. Note that if vl > 4, multiple blocks are encrypted.
.macro aes_encrypt data, keylen
vaesz.vs \data, v1
vaesem.vs \data, v2
vaesem.vs \data, v3
vaesem.vs \data, v4
vaesem.vs \data, v5
vaesem.vs \data, v6
vaesem.vs \data, v7
vaesem.vs \data, v8
vaesem.vs \data, v9
vaesem.vs \data, v10
.if \keylen == 128
vaesef.vs \data, v11
.elseif \keylen == 192
vaesem.vs \data, v11
vaesem.vs \data, v12
vaesef.vs \data, v13
.else
vaesem.vs \data, v11
vaesem.vs \data, v12
vaesem.vs \data, v13
vaesem.vs \data, v14
vaesef.vs \data, v15
.endif
.endm
// Same as aes_encrypt, but decrypts instead of encrypts.
.macro aes_decrypt data, keylen
.if \keylen == 128
vaesz.vs \data, v11
.elseif \keylen == 192
vaesz.vs \data, v13
vaesdm.vs \data, v12
vaesdm.vs \data, v11
.else
vaesz.vs \data, v15
vaesdm.vs \data, v14
vaesdm.vs \data, v13
vaesdm.vs \data, v12
vaesdm.vs \data, v11
.endif
vaesdm.vs \data, v10
vaesdm.vs \data, v9
vaesdm.vs \data, v8
vaesdm.vs \data, v7
vaesdm.vs \data, v6
vaesdm.vs \data, v5
vaesdm.vs \data, v4
vaesdm.vs \data, v3
vaesdm.vs \data, v2
vaesdf.vs \data, v1
.endm
// Expands to aes_encrypt or aes_decrypt according to \enc, which is 1 or 0.
.macro aes_crypt data, enc, keylen
.if \enc
aes_encrypt \data, \keylen
.else
aes_decrypt \data, \keylen
.endif
.endm


@ -0,0 +1,637 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* AES using the RISC-V vector crypto extensions. Includes the bare block
* cipher and the ECB, CBC, CBC-CTS, CTR, and XTS modes.
*
* Copyright (C) 2023 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*
* Copyright 2024 Google LLC
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/aes.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <linux/linkage.h>
#include <linux/module.h>
asmlinkage void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 in[AES_BLOCK_SIZE],
u8 out[AES_BLOCK_SIZE]);
asmlinkage void aes_decrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 in[AES_BLOCK_SIZE],
u8 out[AES_BLOCK_SIZE]);
asmlinkage void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len);
asmlinkage void aes_ecb_decrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len);
asmlinkage void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
u8 iv[AES_BLOCK_SIZE]);
asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
u8 iv[AES_BLOCK_SIZE]);
asmlinkage void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
const u8 iv[AES_BLOCK_SIZE], bool enc);
asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
u8 iv[AES_BLOCK_SIZE]);
asmlinkage void aes_xts_encrypt_zvkned_zvbb_zvkg(
const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
u8 tweak[AES_BLOCK_SIZE]);
asmlinkage void aes_xts_decrypt_zvkned_zvbb_zvkg(
const struct crypto_aes_ctx *key,
const u8 *in, u8 *out, size_t len,
u8 tweak[AES_BLOCK_SIZE]);
static int riscv64_aes_setkey(struct crypto_aes_ctx *ctx,
const u8 *key, unsigned int keylen)
{
/*
* For now we just use the generic key expansion, for these reasons:
*
* - zvkned's key expansion instructions don't support AES-192.
* So, non-zvkned fallback code would be needed anyway.
*
* - Users of AES in Linux usually don't change keys frequently.
* So, key expansion isn't performance-critical.
*
* - For single-block AES exposed as a "cipher" algorithm, it's
* necessary to use struct crypto_aes_ctx and initialize its 'key_dec'
* field with the round keys for the Equivalent Inverse Cipher. This
* is because with "cipher", decryption can be requested from a
* context where the vector unit isn't usable, necessitating a
* fallback to aes_decrypt(). But, zvkned can only generate and use
* the normal round keys. Of course, it's preferable to not have
* special code just for "cipher", as e.g. XTS also uses a
* single-block AES encryption. It's simplest to just use
* struct crypto_aes_ctx and aes_expandkey() everywhere.
*/
return aes_expandkey(ctx, key, keylen);
}
static int riscv64_aes_setkey_cipher(struct crypto_tfm *tfm,
const u8 *key, unsigned int keylen)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
return riscv64_aes_setkey(ctx, key, keylen);
}
static int riscv64_aes_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
{
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
return riscv64_aes_setkey(ctx, key, keylen);
}
/* Bare AES, without a mode of operation */
static void riscv64_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (crypto_simd_usable()) {
kernel_vector_begin();
aes_encrypt_zvkned(ctx, src, dst);
kernel_vector_end();
} else {
aes_encrypt(ctx, dst, src);
}
}
static void riscv64_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (crypto_simd_usable()) {
kernel_vector_begin();
aes_decrypt_zvkned(ctx, src, dst);
kernel_vector_end();
} else {
aes_decrypt(ctx, dst, src);
}
}
/* AES-ECB */
static inline int riscv64_aes_ecb_crypt(struct skcipher_request *req, bool enc)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) != 0) {
kernel_vector_begin();
if (enc)
aes_ecb_encrypt_zvkned(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
nbytes & ~(AES_BLOCK_SIZE - 1));
else
aes_ecb_decrypt_zvkned(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
nbytes & ~(AES_BLOCK_SIZE - 1));
kernel_vector_end();
err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
}
return err;
}
static int riscv64_aes_ecb_encrypt(struct skcipher_request *req)
{
return riscv64_aes_ecb_crypt(req, true);
}
static int riscv64_aes_ecb_decrypt(struct skcipher_request *req)
{
return riscv64_aes_ecb_crypt(req, false);
}
/* AES-CBC */
static int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) != 0) {
kernel_vector_begin();
if (enc)
aes_cbc_encrypt_zvkned(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
nbytes & ~(AES_BLOCK_SIZE - 1),
walk.iv);
else
aes_cbc_decrypt_zvkned(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
nbytes & ~(AES_BLOCK_SIZE - 1),
walk.iv);
kernel_vector_end();
err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
}
return err;
}
static int riscv64_aes_cbc_encrypt(struct skcipher_request *req)
{
return riscv64_aes_cbc_crypt(req, true);
}
static int riscv64_aes_cbc_decrypt(struct skcipher_request *req)
{
return riscv64_aes_cbc_crypt(req, false);
}
/* AES-CBC-CTS */
static int riscv64_aes_cbc_cts_crypt(struct skcipher_request *req, bool enc)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
unsigned int cbc_len;
int err;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
/*
* If the full message is available in one step, decrypt it in one call
* to the CBC-CTS assembly function. This reduces overhead, especially
* on short messages. Otherwise, fall back to doing CBC up to the last
* two blocks, then invoke CTS just for the ciphertext stealing.
*/
if (unlikely(walk.nbytes != req->cryptlen)) {
cbc_len = round_down(req->cryptlen - AES_BLOCK_SIZE - 1,
AES_BLOCK_SIZE);
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
cbc_len, req->iv);
err = riscv64_aes_cbc_crypt(&subreq, enc);
if (err)
return err;
dst = src = scatterwalk_ffwd(sg_src, req->src, cbc_len);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, cbc_len);
skcipher_request_set_crypt(&subreq, src, dst,
req->cryptlen - cbc_len, req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
}
kernel_vector_begin();
aes_cbc_cts_crypt_zvkned(ctx, walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, req->iv, enc);
kernel_vector_end();
return skcipher_walk_done(&walk, 0);
}
static int riscv64_aes_cbc_cts_encrypt(struct skcipher_request *req)
{
return riscv64_aes_cbc_cts_crypt(req, true);
}
static int riscv64_aes_cbc_cts_decrypt(struct skcipher_request *req)
{
return riscv64_aes_cbc_cts_crypt(req, false);
}
/* AES-CTR */
static int riscv64_aes_ctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
unsigned int nbytes, p1_nbytes;
struct skcipher_walk walk;
u32 ctr32, nblocks;
int err;
/* Get the low 32-bit word of the 128-bit big endian counter. */
ctr32 = get_unaligned_be32(req->iv + 12);
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) != 0) {
if (nbytes < walk.total) {
/* Not the end yet, so keep the length block-aligned. */
nbytes = round_down(nbytes, AES_BLOCK_SIZE);
nblocks = nbytes / AES_BLOCK_SIZE;
} else {
/* It's the end, so include any final partial block. */
nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
}
ctr32 += nblocks;
kernel_vector_begin();
if (ctr32 >= nblocks) {
/* The low 32-bit word of the counter won't overflow. */
aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr,
walk.dst.virt.addr, nbytes,
req->iv);
} else {
/*
* The low 32-bit word of the counter will overflow.
* The assembly doesn't handle this case, so split the
* operation into two at the point where the overflow
* will occur. After the first part, add the carry bit.
*/
p1_nbytes = min_t(unsigned int, nbytes,
(nblocks - ctr32) * AES_BLOCK_SIZE);
aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
p1_nbytes, req->iv);
crypto_inc(req->iv, 12);
if (ctr32) {
aes_ctr32_crypt_zvkned_zvkb(
ctx,
walk.src.virt.addr + p1_nbytes,
walk.dst.virt.addr + p1_nbytes,
nbytes - p1_nbytes, req->iv);
}
}
kernel_vector_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
/* AES-XTS */
struct riscv64_aes_xts_ctx {
struct crypto_aes_ctx ctx1;
struct crypto_aes_ctx ctx2;
};
static int riscv64_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return xts_verify_key(tfm, key, keylen) ?:
riscv64_aes_setkey(&ctx->ctx1, key, keylen / 2) ?:
riscv64_aes_setkey(&ctx->ctx2, key + keylen / 2, keylen / 2);
}
static int riscv64_aes_xts_crypt(struct skcipher_request *req, bool enc)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
int err;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
/* Encrypt the IV with the tweak key to get the first tweak. */
kernel_vector_begin();
aes_encrypt_zvkned(&ctx->ctx2, req->iv, req->iv);
kernel_vector_end();
err = skcipher_walk_virt(&walk, req, false);
/*
* If the message length isn't divisible by the AES block size and the
* full message isn't available in one step of the scatterlist walk,
* then separate off the last full block and the partial block. This
* ensures that they are processed in the same call to the assembly
* function, which is required for ciphertext stealing.
*/
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
req->cryptlen - tail - AES_BLOCK_SIZE,
req->iv);
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
} else {
tail = 0;
}
while (walk.nbytes) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, AES_BLOCK_SIZE);
kernel_vector_begin();
if (enc)
aes_xts_encrypt_zvkned_zvbb_zvkg(
&ctx->ctx1, walk.src.virt.addr,
walk.dst.virt.addr, nbytes, req->iv);
else
aes_xts_decrypt_zvkned_zvbb_zvkg(
&ctx->ctx1, walk.src.virt.addr,
walk.dst.virt.addr, nbytes, req->iv);
kernel_vector_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
if (err || likely(!tail))
return err;
/* Do ciphertext stealing with the last full block and partial block. */
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
kernel_vector_begin();
if (enc)
aes_xts_encrypt_zvkned_zvbb_zvkg(
&ctx->ctx1, walk.src.virt.addr,
walk.dst.virt.addr, walk.nbytes, req->iv);
else
aes_xts_decrypt_zvkned_zvbb_zvkg(
&ctx->ctx1, walk.src.virt.addr,
walk.dst.virt.addr, walk.nbytes, req->iv);
kernel_vector_end();
return skcipher_walk_done(&walk, 0);
}
static int riscv64_aes_xts_encrypt(struct skcipher_request *req)
{
return riscv64_aes_xts_crypt(req, true);
}
static int riscv64_aes_xts_decrypt(struct skcipher_request *req)
{
return riscv64_aes_xts_crypt(req, false);
}
/* Algorithm definitions */
static struct crypto_alg riscv64_zvkned_aes_cipher_alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_priority = 300,
.cra_name = "aes",
.cra_driver_name = "aes-riscv64-zvkned",
.cra_cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = riscv64_aes_setkey_cipher,
.cia_encrypt = riscv64_aes_encrypt,
.cia_decrypt = riscv64_aes_decrypt,
},
.cra_module = THIS_MODULE,
};
static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = {
{
.setkey = riscv64_aes_setkey_skcipher,
.encrypt = riscv64_aes_ecb_encrypt,
.decrypt = riscv64_aes_ecb_decrypt,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.walksize = 8 * AES_BLOCK_SIZE, /* matches LMUL=8 */
.base = {
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_priority = 300,
.cra_name = "ecb(aes)",
.cra_driver_name = "ecb-aes-riscv64-zvkned",
.cra_module = THIS_MODULE,
},
}, {
.setkey = riscv64_aes_setkey_skcipher,
.encrypt = riscv64_aes_cbc_encrypt,
.decrypt = riscv64_aes_cbc_decrypt,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.base = {
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_priority = 300,
.cra_name = "cbc(aes)",
.cra_driver_name = "cbc-aes-riscv64-zvkned",
.cra_module = THIS_MODULE,
},
}, {
.setkey = riscv64_aes_setkey_skcipher,
.encrypt = riscv64_aes_cbc_cts_encrypt,
.decrypt = riscv64_aes_cbc_cts_decrypt,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
.base = {
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_priority = 300,
.cra_name = "cts(cbc(aes))",
.cra_driver_name = "cts-cbc-aes-riscv64-zvkned",
.cra_module = THIS_MODULE,
},
}
};
static struct skcipher_alg riscv64_zvkned_zvkb_aes_skcipher_alg = {
.setkey = riscv64_aes_setkey_skcipher,
.encrypt = riscv64_aes_ctr_crypt,
.decrypt = riscv64_aes_ctr_crypt,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
.base = {
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_priority = 300,
.cra_name = "ctr(aes)",
.cra_driver_name = "ctr-aes-riscv64-zvkned-zvkb",
.cra_module = THIS_MODULE,
},
};
static struct skcipher_alg riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg = {
.setkey = riscv64_aes_xts_setkey,
.encrypt = riscv64_aes_xts_encrypt,
.decrypt = riscv64_aes_xts_decrypt,
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
.base = {
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct riscv64_aes_xts_ctx),
.cra_priority = 300,
.cra_name = "xts(aes)",
.cra_driver_name = "xts-aes-riscv64-zvkned-zvbb-zvkg",
.cra_module = THIS_MODULE,
},
};
static inline bool riscv64_aes_xts_supported(void)
{
return riscv_isa_extension_available(NULL, ZVBB) &&
riscv_isa_extension_available(NULL, ZVKG) &&
riscv_vector_vlen() < 2048 /* Implementation limitation */;
}
static int __init riscv64_aes_mod_init(void)
{
int err = -ENODEV;
if (riscv_isa_extension_available(NULL, ZVKNED) &&
riscv_vector_vlen() >= 128) {
err = crypto_register_alg(&riscv64_zvkned_aes_cipher_alg);
if (err)
return err;
err = crypto_register_skciphers(
riscv64_zvkned_aes_skcipher_algs,
ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
if (err)
goto unregister_zvkned_cipher_alg;
if (riscv_isa_extension_available(NULL, ZVKB)) {
err = crypto_register_skcipher(
&riscv64_zvkned_zvkb_aes_skcipher_alg);
if (err)
goto unregister_zvkned_skcipher_algs;
}
if (riscv64_aes_xts_supported()) {
err = crypto_register_skcipher(
&riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg);
if (err)
goto unregister_zvkned_zvkb_skcipher_alg;
}
}
return err;
unregister_zvkned_zvkb_skcipher_alg:
if (riscv_isa_extension_available(NULL, ZVKB))
crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg);
unregister_zvkned_skcipher_algs:
crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs,
ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
unregister_zvkned_cipher_alg:
crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg);
return err;
}
static void __exit riscv64_aes_mod_exit(void)
{
if (riscv64_aes_xts_supported())
crypto_unregister_skcipher(&riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg);
if (riscv_isa_extension_available(NULL, ZVKB))
crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg);
crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs,
ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg);
}
module_init(riscv64_aes_mod_init);
module_exit(riscv64_aes_mod_exit);
MODULE_DESCRIPTION("AES-ECB/CBC/CTS/CTR/XTS (RISC-V accelerated)");
MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");


@ -0,0 +1,312 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')
#include <linux/linkage.h>
.text
.option arch, +zvkned, +zvbb, +zvkg
#include "aes-macros.S"
#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define TWEAKP a4
#define LEN32 a5
#define TAIL_LEN a6
#define VL a7
#define VLMAX t4
// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS v16 // LMUL=4 (most of the time)
#define TWEAKS_BREV v20 // LMUL=4 (most of the time)
#define MULTS_BREV v24 // LMUL=4 (most of the time)
#define TMP0 v28
#define TMP1 v29
#define TMP2 v30
#define TMP3 v31
// xts_init initializes the following values:
//
// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
// TWEAKS_BREV: same as TWEAKS, but bit-reversed
// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
.macro xts_init
// Load the first tweak T.
vsetivli zero, 4, e32, m1, ta, ma
vle32.v TWEAKS, (TWEAKP)
// If there's only one block (or no blocks at all), then skip the tweak
// sequence computation because (at most) T itself is needed.
li t0, 16
ble LEN, t0, .Linit_single_block\@
// Save a copy of T bit-reversed in v12.
vbrev8.v v12, TWEAKS
//
// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
// that N <= 128. Though, this code actually requires N < 64 (or
// equivalently VLEN < 2048) due to the use of 64-bit intermediate
// values here and in the x^N computation later.
//
vsetvli VL, LEN32, e32, m4, ta, ma
srli t0, VL, 2 // t0 = N (num blocks)
// Generate two sequences, each with N 32-bit values:
// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
vsetvli zero, t0, e32, m1, ta, ma
vmv.v.i v0, 1
vid.v v1
// Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
// as two sequences, each with 2*N 32-bit values:
// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
vsetvli zero, t0, e64, m2, ta, ma
vzext.vf2 v2, v0
vzext.vf2 v4, v1
slli t1, t0, 1 // t1 = 2*N
vsetvli zero, t1, e32, m2, ta, ma
// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
// widening to 64 bits per element. When reinterpreted as N 128-bit
// values, this is the needed sequence of 128-bit values 1 << i (x^i).
vwsll.vv v8, v2, v4
// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
// multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i TWEAKS_BREV, 0
vaesz.vs TWEAKS_BREV, v12
vbrev8.v v8, v8
vgmul.vv TWEAKS_BREV, v8
// Save a copy of the sequence T*(x^i) with the bit reversal undone.
vbrev8.v TWEAKS, TWEAKS_BREV
// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
li t1, 1
sll t1, t1, t0 // t1 = 1 << N
vsetivli zero, 2, e64, m1, ta, ma
vmv.v.i v0, 0
vsetivli zero, 1, e64, m1, tu, ma
vmv.v.x v0, t1
vbrev8.v v0, v0
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i MULTS_BREV, 0
vaesz.vs MULTS_BREV, v0
j .Linit_done\@
.Linit_single_block\@:
vbrev8.v TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm
// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one.
.macro load_x
li t0, 0x40
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.i MULTS_BREV, 0
vsetivli zero, 1, e8, m1, tu, ma
vmv.v.x MULTS_BREV, t0
.endm
.macro __aes_xts_crypt enc, keylen
// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
beqz LEN32, .Lcts_without_main_loop\@
vsetvli VLMAX, zero, e32, m4, ta, ma
1:
vsetvli VL, LEN32, e32, m4, ta, ma
2:
// Encrypt or decrypt VL/4 blocks.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TWEAKS
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TWEAKS
vse32.v TMP0, (OUTP)
// Update the pointers and the remaining length.
slli t0, VL, 2
add INP, INP, t0
add OUTP, OUTP, t0
sub LEN32, LEN32, VL
// Check whether more blocks remain.
beqz LEN32, .Lmain_loop_done\@
// Compute the next sequence of tweaks by multiplying the previous
// sequence by x^N. Store the result in both bit-reversed order and
// regular order (i.e. with the bit reversal undone).
vgmul.vv TWEAKS_BREV, MULTS_BREV
vbrev8.v TWEAKS, TWEAKS_BREV
// Since we compute the tweak multipliers x^N in advance, we require
// that each iteration process the same length except possibly the last.
// This conflicts slightly with the behavior allowed by RISC-V Vector
// Extension, where CPUs can select a lower length for both of the last
// two iterations. E.g., vl might take the sequence of values
// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
// can use x^4 again instead of computing x^3. Therefore, we explicitly
// keep the vl at VLMAX if there is at least VLMAX remaining.
bge LEN32, VLMAX, 2b
j 1b
.Lmain_loop_done\@:
load_x
// Compute the next tweak.
addi t0, VL, -4
vsetivli zero, 4, e32, m4, ta, ma
vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak
vsetivli zero, 4, e32, m1, ta, ma
vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak
bnez TAIL_LEN, .Lcts\@
// Update *TWEAKP to contain the next tweak.
vbrev8.v TWEAKS, TWEAKS_BREV
vse32.v TWEAKS, (TWEAKP)
ret
.Lcts_without_main_loop\@:
load_x
.Lcts\@:
// TWEAKS_BREV now contains the next tweak. Compute the one after that.
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.v TMP0, TWEAKS_BREV
vgmul.vv TMP0, MULTS_BREV
// Undo the bit reversal of the next two tweaks and store them in TMP1
// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
vbrev8.v TMP1, TWEAKS_BREV
vbrev8.v TMP2, TMP0
.else
vbrev8.v TMP1, TMP0
vbrev8.v TMP2, TWEAKS_BREV
.endif
// Encrypt/decrypt the last full block.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TMP1
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP1
// Swap the first TAIL_LEN bytes of the above result with the tail.
// Note that to support in-place encryption/decryption, the load from
// the input tail must happen before the store to the output tail.
addi t0, INP, 16
addi t1, OUTP, 16
vmv.v.v TMP3, TMP0
vsetvli zero, TAIL_LEN, e8, m1, tu, ma
vle8.v TMP0, (t0)
vse8.v TMP3, (t1)
// Encrypt/decrypt again and store the last full block.
vsetivli zero, 4, e32, m1, ta, ma
vxor.vv TMP0, TMP0, TMP2
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP2
vse32.v TMP0, (OUTP)
ret
.endm
.macro aes_xts_crypt enc
// Check whether the length is a multiple of the AES block size.
andi TAIL_LEN, LEN, 15
beqz TAIL_LEN, 1f
// The length isn't a multiple of the AES block size, so ciphertext
// stealing will be required. Ciphertext stealing involves special
// handling of the partial block and the last full block, so subtract
// the length of both from the length to be processed in the main loop.
sub LEN, LEN, TAIL_LEN
addi LEN, LEN, -16
1:
srli LEN32, LEN, 2
// LEN and LEN32 now contain the total length of the blocks that will be
// processed in the main loop, in bytes and 32-bit words respectively.
xts_init
aes_begin KEYP, 128f, 192f
__aes_xts_crypt \enc, 256
128:
__aes_xts_crypt \enc, 128
192:
__aes_xts_crypt \enc, 192
.endm
// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)


@ -0,0 +1,146 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/linkage.h>
.text
.option arch, +zvkned, +zvkb
#include "aes-macros.S"
#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define IVP a4
#define LEN32 a5
#define VL_E32 a6
#define VL_BLOCKS a7
.macro aes_ctr32_crypt keylen
// LEN32 = number of blocks, rounded up, in 32-bit words.
addi t0, LEN, 15
srli t0, t0, 4
slli LEN32, t0, 2
// Create a mask that selects the last 32-bit word of each 128-bit
// block. This is the word that contains the (big-endian) counter.
li t0, 0x88
vsetvli t1, zero, e8, m1, ta, ma
vmv.v.x v0, t0
// Load the IV into v31. The last 32-bit word contains the counter.
vsetivli zero, 4, e32, m1, ta, ma
vle32.v v31, (IVP)
// Convert the big-endian counter into little-endian.
vsetivli zero, 4, e32, m1, ta, mu
vrev8.v v31, v31, v0.t
// Splat the IV to v16 (with LMUL=4). The number of copies is the
// maximum number of blocks that will be processed per iteration.
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i v16, 0
vaesz.vs v16, v31
// v20 = [x, x, x, 0, x, x, x, 1, ...]
viota.m v20, v0, v0.t
// v16 = [IV0, IV1, IV2, counter+0, IV0, IV1, IV2, counter+1, ...]
vsetvli VL_E32, LEN32, e32, m4, ta, mu
vadd.vv v16, v16, v20, v0.t
j 2f
1:
// Set the number of blocks to process in this iteration. vl=VL_E32 is
// the length in 32-bit words, i.e. 4 times the number of blocks.
vsetvli VL_E32, LEN32, e32, m4, ta, mu
// Increment the counters by the number of blocks processed in the
// previous iteration.
vadd.vx v16, v16, VL_BLOCKS, v0.t
2:
// Prepare the AES inputs into v24.
vmv.v.v v24, v16
vrev8.v v24, v24, v0.t // Convert counters back to big-endian.
// Encrypt the AES inputs to create the next portion of the keystream.
aes_encrypt v24, \keylen
// XOR the data with the keystream.
vsetvli t0, LEN, e8, m4, ta, ma
vle8.v v20, (INP)
vxor.vv v20, v20, v24
vse8.v v20, (OUTP)
// Advance the pointers and update the remaining length.
add INP, INP, t0
add OUTP, OUTP, t0
sub LEN, LEN, t0
sub LEN32, LEN32, VL_E32
srli VL_BLOCKS, VL_E32, 2
// Repeat if more data remains.
bnez LEN, 1b
// Update *IVP to contain the next counter.
vsetivli zero, 4, e32, m1, ta, mu
vadd.vx v16, v16, VL_BLOCKS, v0.t
vrev8.v v16, v16, v0.t // Convert counters back to big-endian.
vse32.v v16, (IVP)
ret
.endm
// void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// u8 iv[16]);
SYM_FUNC_START(aes_ctr32_crypt_zvkned_zvkb)
aes_begin KEYP, 128f, 192f
aes_ctr32_crypt 256
128:
aes_ctr32_crypt 128
192:
aes_ctr32_crypt 192
SYM_FUNC_END(aes_ctr32_crypt_zvkned_zvkb)


@ -0,0 +1,339 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')
#include <linux/linkage.h>
.text
.option arch, +zvkned
#include "aes-macros.S"
#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define IVP a4
.macro __aes_crypt_zvkned enc, keylen
vle32.v v16, (INP)
aes_crypt v16, \enc, \keylen
vse32.v v16, (OUTP)
ret
.endm
.macro aes_crypt_zvkned enc
aes_begin KEYP, 128f, 192f
__aes_crypt_zvkned \enc, 256
128:
__aes_crypt_zvkned \enc, 128
192:
__aes_crypt_zvkned \enc, 192
.endm
// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
// const u8 in[16], u8 out[16]);
SYM_FUNC_START(aes_encrypt_zvkned)
aes_crypt_zvkned 1
SYM_FUNC_END(aes_encrypt_zvkned)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_decrypt_zvkned)
aes_crypt_zvkned 0
SYM_FUNC_END(aes_decrypt_zvkned)
.macro __aes_ecb_crypt enc, keylen
srli t0, LEN, 2
// t0 is the remaining length in 32-bit words. It's a multiple of 4.
1:
vsetvli t1, t0, e32, m8, ta, ma
sub t0, t0, t1 // Subtract number of words processed
slli t1, t1, 2 // Words to bytes
vle32.v v16, (INP)
aes_crypt v16, \enc, \keylen
vse32.v v16, (OUTP)
add INP, INP, t1
add OUTP, OUTP, t1
bnez t0, 1b
ret
.endm
.macro aes_ecb_crypt enc
aes_begin KEYP, 128f, 192f
__aes_ecb_crypt \enc, 256
128:
__aes_ecb_crypt \enc, 128
192:
__aes_ecb_crypt \enc, 192
.endm
// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
SYM_FUNC_START(aes_ecb_encrypt_zvkned)
aes_ecb_crypt 1
SYM_FUNC_END(aes_ecb_encrypt_zvkned)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_ecb_decrypt_zvkned)
aes_ecb_crypt 0
SYM_FUNC_END(aes_ecb_decrypt_zvkned)
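
ECB applies the block cipher to each 16-byte block independently, which is why the loop above only needs to track the remaining length in 32-bit words. A minimal sketch, again assuming a hypothetical aes_encrypt_block() helper:

#include <stdint.h>
#include <stddef.h>

void aes_encrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]); /* hypothetical */

/* ECB sketch; |len| must be a nonzero multiple of 16, as documented above. */
static void ecb_encrypt(const void *key, const uint8_t *in, uint8_t *out,
			size_t len)
{
	while (len) {
		aes_encrypt_block(key, in, out);	/* every block is independent */
		in += 16;
		out += 16;
		len -= 16;
	}
}

Decryption is identical apart from using the inverse block operation, which is why the asm shares one __aes_ecb_crypt macro for both directions.
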
.macro aes_cbc_encrypt keylen
vle32.v v16, (IVP) // Load IV
1:
vle32.v v17, (INP) // Load plaintext block
vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block
aes_encrypt v16, \keylen // Encrypt
vse32.v v16, (OUTP) // Store ciphertext block
addi INP, INP, 16
addi OUTP, OUTP, 16
addi LEN, LEN, -16
bnez LEN, 1b
vse32.v v16, (IVP) // Store next IV
ret
.endm
.macro aes_cbc_decrypt keylen
srli LEN, LEN, 2 // Convert LEN from bytes to words
vle32.v v16, (IVP) // Load IV
1:
vsetvli t0, LEN, e32, m4, ta, ma
vle32.v v20, (INP) // Load ciphertext blocks
vslideup.vi v16, v20, 4 // Setup prev ciphertext blocks
addi t1, t0, -4
vslidedown.vx v24, v20, t1 // Save last ciphertext block
aes_decrypt v20, \keylen // Decrypt the blocks
vxor.vv v20, v20, v16 // XOR with prev ciphertext blocks
vse32.v v20, (OUTP) // Store plaintext blocks
vmv.v.v v16, v24 // Next "IV" is last ciphertext block
slli t1, t0, 2 // Words to bytes
add INP, INP, t1
add OUTP, OUTP, t1
sub LEN, LEN, t0
bnez LEN, 1b
vsetivli zero, 4, e32, m1, ta, ma
vse32.v v16, (IVP) // Store next IV
ret
.endm
// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len, u8 iv[16]);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
SYM_FUNC_START(aes_cbc_encrypt_zvkned)
aes_begin KEYP, 128f, 192f
aes_cbc_encrypt 256
128:
aes_cbc_encrypt 128
192:
aes_cbc_encrypt 192
SYM_FUNC_END(aes_cbc_encrypt_zvkned)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_cbc_decrypt_zvkned)
aes_begin KEYP, 128f, 192f
aes_cbc_decrypt 256
128:
aes_cbc_decrypt 128
192:
aes_cbc_decrypt 192
SYM_FUNC_END(aes_cbc_decrypt_zvkned)
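
The CBC macros follow the usual chaining rule: each plaintext block is XORed with the previous ciphertext block (or the IV) before encryption, and the final ciphertext block becomes the next IV. A scalar sketch, assuming hypothetical aes_encrypt_block()/aes_decrypt_block() helpers; note that decryption only needs the already-known previous ciphertext block, which is the property the vectorized aes_cbc_decrypt above exploits.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void aes_encrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]); /* hypothetical */
void aes_decrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]); /* hypothetical */

/* CBC sketch; |len| is a nonzero multiple of 16 and |iv| is updated to the
 * last ciphertext block, mirroring the "Store next IV" stores above. */
static void cbc_encrypt(const void *key, const uint8_t *in, uint8_t *out,
			size_t len, uint8_t iv[16])
{
	uint8_t x[16];
	size_t i;

	while (len) {
		for (i = 0; i < 16; i++)
			x[i] = in[i] ^ iv[i];	/* XOR with IV or prev ciphertext */
		aes_encrypt_block(key, x, out);
		memcpy(iv, out, 16);		/* chain into the next block */
		in += 16; out += 16; len -= 16;
	}
}

static void cbc_decrypt(const void *key, const uint8_t *in, uint8_t *out,
			size_t len, uint8_t iv[16])
{
	uint8_t prev[16], cur[16];
	size_t i;

	memcpy(prev, iv, 16);
	while (len) {
		memcpy(cur, in, 16);		/* save ciphertext before writing out */
		aes_decrypt_block(key, in, out);
		for (i = 0; i < 16; i++)
			out[i] ^= prev[i];	/* XOR with IV or prev ciphertext */
		memcpy(prev, cur, 16);
		in += 16; out += 16; len -= 16;
	}
	memcpy(iv, prev, 16);			/* next IV = last ciphertext block */
}
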
.macro aes_cbc_cts_encrypt keylen
// CBC-encrypt all blocks except the last. But don't store the
// second-to-last block to the output buffer yet, since it will be
// handled specially in the ciphertext stealing step. Exception: if the
// message is single-block, still encrypt the last (and only) block.
li t0, 16
j 2f
1:
vse32.v v16, (OUTP) // Store ciphertext block
addi OUTP, OUTP, 16
2:
vle32.v v17, (INP) // Load plaintext block
vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block
aes_encrypt v16, \keylen // Encrypt
addi INP, INP, 16
addi LEN, LEN, -16
bgt LEN, t0, 1b // Repeat if more than one block remains
// Special case: if the message is a single block, just do CBC.
beqz LEN, .Lcts_encrypt_done\@
// Encrypt the last two blocks using ciphertext stealing as follows:
// C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
// C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
//
// C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
// plaintext block. Block n, the last block, may be partial; its length
// is 1 <= LEN <= 16. If there are only 2 blocks, C[n-2] means the IV.
//
// v16 already contains Encrypt(P[n-1] ^ C[n-2]).
// INP points to P[n]. OUTP points to where C[n-1] should go.
// To support in-place encryption, load P[n] before storing C[n].
addi t0, OUTP, 16 // Get pointer to where C[n] should go
vsetvli zero, LEN, e8, m1, tu, ma
vle8.v v17, (INP) // Load P[n]
vse8.v v16, (t0) // Store C[n]
vxor.vv v16, v16, v17 // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
vsetivli zero, 4, e32, m1, ta, ma
aes_encrypt v16, \keylen
.Lcts_encrypt_done\@:
vse32.v v16, (OUTP) // Store C[n-1] (or C[n] in single-block case)
ret
.endm
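
A scalar sketch of the ciphertext-stealing rule spelled out in the comments above: C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n]) and C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN], with a single-block message degenerating to plain CBC. aes_encrypt_block() is a hypothetical stand-in (assumed to tolerate in == out); this illustrates the algorithm, it is not the kernel implementation.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void aes_encrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]); /* hypothetical */

/* CBC-CTS (CS3) encryption sketch; |len| >= 16. P[n] is loaded before C[n]
 * is stored, so in == out works, as in the asm. */
static void cbc_cts_encrypt(const void *key, const uint8_t *in, uint8_t *out,
			    size_t len, const uint8_t iv[16])
{
	uint8_t e[16], pn[16];
	size_t i, last;

	memcpy(e, iv, 16);			/* e = C[i-1], initially the IV */

	for (;;) {
		for (i = 0; i < 16; i++)
			e[i] ^= in[i];
		aes_encrypt_block(key, e, e);	/* e = Encrypt(P[i] ^ C[i-1]) */
		in += 16;
		len -= 16;
		if (len <= 16)
			break;			/* keep the last full block's output in e */
		memcpy(out, e, 16);		/* store C[i] for all earlier blocks */
		out += 16;
	}

	if (len == 0) {				/* single-block message: plain CBC */
		memcpy(out, e, 16);
		return;
	}

	last = len;				/* 1 <= last <= 16 */
	memcpy(pn, in, last);			/* load P[n] first (in-place support) */
	memcpy(out + 16, e, last);		/* C[n] = Encrypt(P[n-1] ^ C[n-2])[0..last] */
	for (i = 0; i < last; i++)
		e[i] ^= pn[i];			/* XOR only the bytes covered by P[n] */
	aes_encrypt_block(key, e, out);		/* C[n-1] */
}
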
#define LEN32 t4 // Length of remaining full blocks in 32-bit words
#define LEN_MOD16 t5 // Length of message in bytes mod 16
.macro aes_cbc_cts_decrypt keylen
andi LEN32, LEN, ~15
srli LEN32, LEN32, 2
andi LEN_MOD16, LEN, 15
// Save C[n-2] in v28 so that it's available later during the ciphertext
// stealing step. If there are fewer than three blocks, C[n-2] means
// the IV, otherwise it means the third-to-last ciphertext block.
vmv.v.v v28, v16 // IV
add t0, LEN, -33
bltz t0, .Lcts_decrypt_loop\@
andi t0, t0, ~15
add t0, t0, INP
vle32.v v28, (t0)
// CBC-decrypt all full blocks. For the last full block, or the last 2
// full blocks if the message is block-aligned, this doesn't write the
// correct output blocks (unless the message is only a single block),
// because it XORs the wrong values with the raw AES plaintexts. But we
// fix this after this loop without redoing the AES decryptions. This
// approach allows more of the AES decryptions to be parallelized.
.Lcts_decrypt_loop\@:
vsetvli t0, LEN32, e32, m4, ta, ma
addi t1, t0, -4
vle32.v v20, (INP) // Load next set of ciphertext blocks
vmv.v.v v24, v16 // Get IV or last ciphertext block of prev set
vslideup.vi v24, v20, 4 // Setup prev ciphertext blocks
vslidedown.vx v16, v20, t1 // Save last ciphertext block of this set
aes_decrypt v20, \keylen // Decrypt this set of blocks
vxor.vv v24, v24, v20 // XOR prev ciphertext blocks with decrypted blocks
vse32.v v24, (OUTP) // Store this set of plaintext blocks
sub LEN32, LEN32, t0
slli t0, t0, 2 // Words to bytes
add INP, INP, t0
add OUTP, OUTP, t0
bnez LEN32, .Lcts_decrypt_loop\@
vsetivli zero, 4, e32, m4, ta, ma
vslidedown.vx v20, v20, t1 // Extract raw plaintext of last full block
addi t0, OUTP, -16 // Get pointer to last full plaintext block
bnez LEN_MOD16, .Lcts_decrypt_non_block_aligned\@
// Special case: if the message is a single block, just do CBC.
li t1, 16
beq LEN, t1, .Lcts_decrypt_done\@
// Block-aligned message. Just fix up the last 2 blocks. We need:
//
// P[n-1] = Decrypt(C[n]) ^ C[n-2]
// P[n] = Decrypt(C[n-1]) ^ C[n]
//
// We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
// Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
// is everything needed to fix the output without re-decrypting blocks.
addi t1, OUTP, -32 // Get pointer to where P[n-1] should go
vxor.vv v20, v20, v28 // Decrypt(C[n]) ^ C[n-2] == P[n-1]
vle32.v v24, (t1) // Decrypt(C[n-1]) ^ C[n-2]
vse32.v v20, (t1) // Store P[n-1]
vxor.vv v20, v24, v16 // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
j .Lcts_decrypt_finish\@
.Lcts_decrypt_non_block_aligned\@:
// Decrypt the last two blocks using ciphertext stealing as follows:
//
// P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
// P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
//
// We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
vmv.v.v v16, v20 // v16 = Decrypt(C[n-1])
vsetvli zero, LEN_MOD16, e8, m1, tu, ma
vle8.v v20, (INP) // v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
vxor.vv v16, v16, v20 // v16 = Decrypt(C[n-1]) ^ C[n]
vse8.v v16, (OUTP) // Store P[n]
vsetivli zero, 4, e32, m1, ta, ma
aes_decrypt v20, \keylen // v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
.Lcts_decrypt_finish\@:
vxor.vv v20, v20, v28 // XOR with C[n-2]
vse32.v v20, (t0) // Store last full plaintext block
.Lcts_decrypt_done\@:
ret
.endm
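
The matching decryption, written directly from the P[n-1]/P[n] formulas in the comments; the asm instead decrypts everything in one vectorized pass and fixes up the last two output blocks afterwards, but the result is the same. aes_decrypt_block() is a hypothetical stand-in; illustrative sketch only.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void aes_decrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]); /* hypothetical */

/* CBC-CTS (CS3) decryption sketch; |len| >= 16. */
static void cbc_cts_decrypt(const void *key, const uint8_t *in, uint8_t *out,
			    size_t len, const uint8_t iv[16])
{
	uint8_t prev[16], d[16], stolen[16];
	size_t i, last;

	memcpy(prev, iv, 16);			/* prev = C[i-1], initially the IV */

	if (len == 16) {			/* single block: plain CBC */
		aes_decrypt_block(key, in, out);
		for (i = 0; i < 16; i++)
			out[i] ^= prev[i];
		return;
	}

	/* Ordinary CBC decryption until only C[n-1] and C[n] remain; after
	 * this loop prev holds C[n-2] (the IV if there are only 2 blocks). */
	while (len > 32) {
		aes_decrypt_block(key, in, d);
		for (i = 0; i < 16; i++)
			d[i] ^= prev[i];
		memcpy(prev, in, 16);
		memcpy(out, d, 16);
		in += 16; out += 16; len -= 16;
	}
	last = len - 16;			/* length of C[n], 1 <= last <= 16 */

	aes_decrypt_block(key, in, d);		/* d = Decrypt(C[n-1]) */

	/* Rebuild the stolen block C[n] || Decrypt(C[n-1])[last..16]. */
	memcpy(stolen, in + 16, last);
	memcpy(stolen + last, d + last, 16 - last);

	for (i = 0; i < last; i++)		/* P[n] = (Decrypt(C[n-1]) ^ C[n])[0..last] */
		out[16 + i] = d[i] ^ stolen[i];

	aes_decrypt_block(key, stolen, d);	/* P[n-1] = Decrypt(stolen block) ^ C[n-2] */
	for (i = 0; i < 16; i++)
		out[i] = d[i] ^ prev[i];
}
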
.macro aes_cbc_cts_crypt keylen
vle32.v v16, (IVP) // Load IV
beqz a5, .Lcts_decrypt\@
aes_cbc_cts_encrypt \keylen
.Lcts_decrypt\@:
aes_cbc_cts_decrypt \keylen
.endm
// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// const u8 iv[16], bool enc);
//
// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
// This is the variant that unconditionally swaps the last two blocks.
SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
aes_begin KEYP, 128f, 192f
aes_cbc_cts_crypt 256
128:
aes_cbc_cts_crypt 128
192:
aes_cbc_cts_crypt 192
SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)

View File

@ -0,0 +1,101 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* ChaCha20 using the RISC-V vector crypto extensions
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/linkage.h>
#include <linux/module.h>
asmlinkage void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out,
size_t len, const u32 iv[4]);
static int riscv64_chacha20_crypt(struct skcipher_request *req)
{
u32 iv[CHACHA_IV_SIZE / sizeof(u32)];
u8 block_buffer[CHACHA_BLOCK_SIZE];
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
unsigned int tail_bytes;
int err;
iv[0] = get_unaligned_le32(req->iv);
iv[1] = get_unaligned_le32(req->iv + 4);
iv[2] = get_unaligned_le32(req->iv + 8);
iv[3] = get_unaligned_le32(req->iv + 12);
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes) {
nbytes = walk.nbytes & ~(CHACHA_BLOCK_SIZE - 1);
tail_bytes = walk.nbytes & (CHACHA_BLOCK_SIZE - 1);
kernel_vector_begin();
if (nbytes) {
chacha20_zvkb(ctx->key, walk.src.virt.addr,
walk.dst.virt.addr, nbytes, iv);
iv[0] += nbytes / CHACHA_BLOCK_SIZE;
}
if (walk.nbytes == walk.total && tail_bytes > 0) {
memcpy(block_buffer, walk.src.virt.addr + nbytes,
tail_bytes);
chacha20_zvkb(ctx->key, block_buffer, block_buffer,
CHACHA_BLOCK_SIZE, iv);
memcpy(walk.dst.virt.addr + nbytes, block_buffer,
tail_bytes);
tail_bytes = 0;
}
kernel_vector_end();
err = skcipher_walk_done(&walk, tail_bytes);
}
return err;
}
static struct skcipher_alg riscv64_chacha_alg = {
.setkey = chacha20_setkey,
.encrypt = riscv64_chacha20_crypt,
.decrypt = riscv64_chacha20_crypt,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.base = {
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct chacha_ctx),
.cra_priority = 300,
.cra_name = "chacha20",
.cra_driver_name = "chacha20-riscv64-zvkb",
.cra_module = THIS_MODULE,
},
};
static int __init riscv64_chacha_mod_init(void)
{
if (riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
return crypto_register_skcipher(&riscv64_chacha_alg);
return -ENODEV;
}
static void __exit riscv64_chacha_mod_exit(void)
{
crypto_unregister_skcipher(&riscv64_chacha_alg);
}
module_init(riscv64_chacha_mod_init);
module_exit(riscv64_chacha_mod_exit);
MODULE_DESCRIPTION("ChaCha20 (RISC-V accelerated)");
MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("chacha20");

View File

@ -0,0 +1,294 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/linkage.h>
.text
.option arch, +zvkb
#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define IVP a4
#define CONSTS0 a5
#define CONSTS1 a6
#define CONSTS2 a7
#define CONSTS3 t0
#define TMP t1
#define VL t2
#define STRIDE t3
#define NROUNDS t4
#define KEY0 s0
#define KEY1 s1
#define KEY2 s2
#define KEY3 s3
#define KEY4 s4
#define KEY5 s5
#define KEY6 s6
#define KEY7 s7
#define COUNTER s8
#define NONCE0 s9
#define NONCE1 s10
#define NONCE2 s11
.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
a2, b2, c2, d2, a3, b3, c3, d3
// a += b; d ^= a; d = rol(d, 16);
vadd.vv \a0, \a0, \b0
vadd.vv \a1, \a1, \b1
vadd.vv \a2, \a2, \b2
vadd.vv \a3, \a3, \b3
vxor.vv \d0, \d0, \a0
vxor.vv \d1, \d1, \a1
vxor.vv \d2, \d2, \a2
vxor.vv \d3, \d3, \a3
vror.vi \d0, \d0, 32 - 16
vror.vi \d1, \d1, 32 - 16
vror.vi \d2, \d2, 32 - 16
vror.vi \d3, \d3, 32 - 16
// c += d; b ^= c; b = rol(b, 12);
vadd.vv \c0, \c0, \d0
vadd.vv \c1, \c1, \d1
vadd.vv \c2, \c2, \d2
vadd.vv \c3, \c3, \d3
vxor.vv \b0, \b0, \c0
vxor.vv \b1, \b1, \c1
vxor.vv \b2, \b2, \c2
vxor.vv \b3, \b3, \c3
vror.vi \b0, \b0, 32 - 12
vror.vi \b1, \b1, 32 - 12
vror.vi \b2, \b2, 32 - 12
vror.vi \b3, \b3, 32 - 12
// a += b; d ^= a; d = rol(d, 8);
vadd.vv \a0, \a0, \b0
vadd.vv \a1, \a1, \b1
vadd.vv \a2, \a2, \b2
vadd.vv \a3, \a3, \b3
vxor.vv \d0, \d0, \a0
vxor.vv \d1, \d1, \a1
vxor.vv \d2, \d2, \a2
vxor.vv \d3, \d3, \a3
vror.vi \d0, \d0, 32 - 8
vror.vi \d1, \d1, 32 - 8
vror.vi \d2, \d2, 32 - 8
vror.vi \d3, \d3, 32 - 8
// c += d; b ^= c; b = rol(b, 7);
vadd.vv \c0, \c0, \d0
vadd.vv \c1, \c1, \d1
vadd.vv \c2, \c2, \d2
vadd.vv \c3, \c3, \d3
vxor.vv \b0, \b0, \c0
vxor.vv \b1, \b1, \c1
vxor.vv \b2, \b2, \c2
vxor.vv \b3, \b3, \c3
vror.vi \b0, \b0, 32 - 7
vror.vi \b1, \b1, 32 - 7
vror.vi \b2, \b2, 32 - 7
vror.vi \b3, \b3, 32 - 7
.endm
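
For reference, one ChaCha quarter-round in scalar C, matching the add/xor/rotate comments above; the macro runs four such quarter-rounds at once, vectorized across all blocks being processed. Minimal sketch, not the kernel code.

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

static void chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c,
				uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}

The vror.vi immediates are written as 32 - r because vror rotates right, while the reference rotation is to the left.
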
// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len,
// const u32 iv[4]);
//
// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE).
// The counter is treated as 32-bit, following the RFC7539 convention.
SYM_FUNC_START(chacha20_zvkb)
srli LEN, LEN, 6 // Bytes to blocks
addi sp, sp, -96
sd s0, 0(sp)
sd s1, 8(sp)
sd s2, 16(sp)
sd s3, 24(sp)
sd s4, 32(sp)
sd s5, 40(sp)
sd s6, 48(sp)
sd s7, 56(sp)
sd s8, 64(sp)
sd s9, 72(sp)
sd s10, 80(sp)
sd s11, 88(sp)
li STRIDE, 64
// Set up the initial state matrix in scalar registers.
li CONSTS0, 0x61707865 // "expa" little endian
li CONSTS1, 0x3320646e // "nd 3" little endian
li CONSTS2, 0x79622d32 // "2-by" little endian
li CONSTS3, 0x6b206574 // "te k" little endian
lw KEY0, 0(KEYP)
lw KEY1, 4(KEYP)
lw KEY2, 8(KEYP)
lw KEY3, 12(KEYP)
lw KEY4, 16(KEYP)
lw KEY5, 20(KEYP)
lw KEY6, 24(KEYP)
lw KEY7, 28(KEYP)
lw COUNTER, 0(IVP)
lw NONCE0, 4(IVP)
lw NONCE1, 8(IVP)
lw NONCE2, 12(IVP)
.Lblock_loop:
// Set vl to the number of blocks to process in this iteration.
vsetvli VL, LEN, e32, m1, ta, ma
// Set up the initial state matrix for the next VL blocks in v0-v15.
// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
// Note that only the counter word, at index 12, differs across blocks.
vmv.v.x v0, CONSTS0
vmv.v.x v1, CONSTS1
vmv.v.x v2, CONSTS2
vmv.v.x v3, CONSTS3
vmv.v.x v4, KEY0
vmv.v.x v5, KEY1
vmv.v.x v6, KEY2
vmv.v.x v7, KEY3
vmv.v.x v8, KEY4
vmv.v.x v9, KEY5
vmv.v.x v10, KEY6
vmv.v.x v11, KEY7
vid.v v12
vadd.vx v12, v12, COUNTER
vmv.v.x v13, NONCE0
vmv.v.x v14, NONCE1
vmv.v.x v15, NONCE2
// Load the first half of the input data for each block into v16-v23.
// v{16+i} holds the i'th 32-bit word for all blocks.
vlsseg8e32.v v16, (INP), STRIDE
li NROUNDS, 20
.Lnext_doubleround:
addi NROUNDS, NROUNDS, -2
// column round
chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \
v2, v6, v10, v14, v3, v7, v11, v15
// diagonal round
chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \
v2, v7, v8, v13, v3, v4, v9, v14
bnez NROUNDS, .Lnext_doubleround
// Load the second half of the input data for each block into v24-v31.
// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
addi TMP, INP, 32
vlsseg8e32.v v24, (TMP), STRIDE
// Finalize the first half of the keystream for each block.
vadd.vx v0, v0, CONSTS0
vadd.vx v1, v1, CONSTS1
vadd.vx v2, v2, CONSTS2
vadd.vx v3, v3, CONSTS3
vadd.vx v4, v4, KEY0
vadd.vx v5, v5, KEY1
vadd.vx v6, v6, KEY2
vadd.vx v7, v7, KEY3
// Encrypt/decrypt the first half of the data for each block.
vxor.vv v16, v16, v0
vxor.vv v17, v17, v1
vxor.vv v18, v18, v2
vxor.vv v19, v19, v3
vxor.vv v20, v20, v4
vxor.vv v21, v21, v5
vxor.vv v22, v22, v6
vxor.vv v23, v23, v7
// Store the first half of the output data for each block.
vssseg8e32.v v16, (OUTP), STRIDE
// Finalize the second half of the keystream for each block.
vadd.vx v8, v8, KEY4
vadd.vx v9, v9, KEY5
vadd.vx v10, v10, KEY6
vadd.vx v11, v11, KEY7
vid.v v0
vadd.vx v12, v12, COUNTER
vadd.vx v13, v13, NONCE0
vadd.vx v14, v14, NONCE1
vadd.vx v15, v15, NONCE2
vadd.vv v12, v12, v0
// Encrypt/decrypt the second half of the data for each block.
vxor.vv v24, v24, v8
vxor.vv v25, v25, v9
vxor.vv v26, v26, v10
vxor.vv v27, v27, v11
vxor.vv v28, v28, v12
vxor.vv v29, v29, v13
vxor.vv v30, v30, v14
vxor.vv v31, v31, v15
// Store the second half of the output data for each block.
addi TMP, OUTP, 32
vssseg8e32.v v24, (TMP), STRIDE
// Update the counter, the remaining number of blocks, and the input and
// output pointers according to the number of blocks processed (VL).
add COUNTER, COUNTER, VL
sub LEN, LEN, VL
slli TMP, VL, 6
add OUTP, OUTP, TMP
add INP, INP, TMP
bnez LEN, .Lblock_loop
ld s0, 0(sp)
ld s1, 8(sp)
ld s2, 16(sp)
ld s3, 24(sp)
ld s4, 32(sp)
ld s5, 40(sp)
ld s6, 48(sp)
ld s7, 56(sp)
ld s8, 64(sp)
ld s9, 72(sp)
ld s10, 80(sp)
ld s11, 88(sp)
addi sp, sp, 96
ret
SYM_FUNC_END(chacha20_zvkb)
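
The per-block state set up in .Lblock_loop can be summarized by the following C sketch: 4 constants, 8 key words, a 32-bit counter that is the only word differing between blocks, and 3 nonce words. chacha20_init_state() is an illustrative helper, not part of this patch.

#include <stdint.h>
#include <string.h>

/* State layout for block |blk|, matching the v0-v15 setup above. |key| is 8
 * little-endian words; |iv| is {counter, nonce0, nonce1, nonce2}. */
static void chacha20_init_state(uint32_t state[16], const uint32_t key[8],
				const uint32_t iv[4], uint32_t blk)
{
	static const uint32_t consts[4] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 /* "expand 32-byte k" */
	};

	memcpy(&state[0], consts, sizeof(consts));	/* words 0-3: constants */
	memcpy(&state[4], key, 8 * sizeof(uint32_t));	/* words 4-11: key */
	state[12] = iv[0] + blk;			/* word 12: 32-bit counter */
	state[13] = iv[1];				/* words 13-15: nonce */
	state[14] = iv[2];
	state[15] = iv[3];
}

After the 10 column/diagonal double rounds, this initial state is added back word-wise before XORing with the data, which is what the vadd.vx sequence after the round loop does (with the per-block counter contribution re-created by vid.v).
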

View File

@ -0,0 +1,168 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* GHASH using the RISC-V vector crypto extensions
*
* Copyright (C) 2023 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/ghash.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <linux/linkage.h>
#include <linux/module.h>
asmlinkage void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data,
size_t len);
struct riscv64_ghash_tfm_ctx {
be128 key;
};
struct riscv64_ghash_desc_ctx {
be128 accumulator;
u8 buffer[GHASH_BLOCK_SIZE];
u32 bytes;
};
static int riscv64_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
{
struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(tfm);
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
memcpy(&tctx->key, key, GHASH_BLOCK_SIZE);
return 0;
}
static int riscv64_ghash_init(struct shash_desc *desc)
{
struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
*dctx = (struct riscv64_ghash_desc_ctx){};
return 0;
}
static inline void
riscv64_ghash_blocks(const struct riscv64_ghash_tfm_ctx *tctx,
struct riscv64_ghash_desc_ctx *dctx,
const u8 *src, size_t srclen)
{
/* The srclen is nonzero and a multiple of 16. */
if (crypto_simd_usable()) {
kernel_vector_begin();
ghash_zvkg(&dctx->accumulator, &tctx->key, src, srclen);
kernel_vector_end();
} else {
do {
crypto_xor((u8 *)&dctx->accumulator, src,
GHASH_BLOCK_SIZE);
gf128mul_lle(&dctx->accumulator, &tctx->key);
src += GHASH_BLOCK_SIZE;
srclen -= GHASH_BLOCK_SIZE;
} while (srclen);
}
}
static int riscv64_ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int srclen)
{
const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int len;
if (dctx->bytes) {
if (dctx->bytes + srclen < GHASH_BLOCK_SIZE) {
memcpy(dctx->buffer + dctx->bytes, src, srclen);
dctx->bytes += srclen;
return 0;
}
memcpy(dctx->buffer + dctx->bytes, src,
GHASH_BLOCK_SIZE - dctx->bytes);
riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
GHASH_BLOCK_SIZE);
src += GHASH_BLOCK_SIZE - dctx->bytes;
srclen -= GHASH_BLOCK_SIZE - dctx->bytes;
dctx->bytes = 0;
}
len = round_down(srclen, GHASH_BLOCK_SIZE);
if (len) {
riscv64_ghash_blocks(tctx, dctx, src, len);
src += len;
srclen -= len;
}
if (srclen) {
memcpy(dctx->buffer, src, srclen);
dctx->bytes = srclen;
}
return 0;
}
static int riscv64_ghash_final(struct shash_desc *desc, u8 *out)
{
const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
int i;
if (dctx->bytes) {
for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
dctx->buffer[i] = 0;
riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
GHASH_BLOCK_SIZE);
}
memcpy(out, &dctx->accumulator, GHASH_DIGEST_SIZE);
return 0;
}
static struct shash_alg riscv64_ghash_alg = {
.init = riscv64_ghash_init,
.update = riscv64_ghash_update,
.final = riscv64_ghash_final,
.setkey = riscv64_ghash_setkey,
.descsize = sizeof(struct riscv64_ghash_desc_ctx),
.digestsize = GHASH_DIGEST_SIZE,
.base = {
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct riscv64_ghash_tfm_ctx),
.cra_priority = 300,
.cra_name = "ghash",
.cra_driver_name = "ghash-riscv64-zvkg",
.cra_module = THIS_MODULE,
},
};
static int __init riscv64_ghash_mod_init(void)
{
if (riscv_isa_extension_available(NULL, ZVKG) &&
riscv_vector_vlen() >= 128)
return crypto_register_shash(&riscv64_ghash_alg);
return -ENODEV;
}
static void __exit riscv64_ghash_mod_exit(void)
{
crypto_unregister_shash(&riscv64_ghash_alg);
}
module_init(riscv64_ghash_mod_init);
module_exit(riscv64_ghash_mod_exit);
MODULE_DESCRIPTION("GHASH (RISC-V accelerated)");
MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("ghash");

View File

@ -0,0 +1,72 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector GCM/GMAC extension ('Zvkg')
#include <linux/linkage.h>
.text
.option arch, +zvkg
#define ACCUMULATOR a0
#define KEY a1
#define DATA a2
#define LEN a3
// void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data,
// size_t len);
//
// |len| must be nonzero and a multiple of 16 (GHASH_BLOCK_SIZE).
SYM_FUNC_START(ghash_zvkg)
vsetivli zero, 4, e32, m1, ta, ma
vle32.v v1, (ACCUMULATOR)
vle32.v v2, (KEY)
.Lnext_block:
vle32.v v3, (DATA)
vghsh.vv v1, v2, v3
addi DATA, DATA, 16
addi LEN, LEN, -16
bnez LEN, .Lnext_block
vse32.v v1, (ACCUMULATOR)
ret
SYM_FUNC_END(ghash_zvkg)
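
In other words, each 16-byte block updates the accumulator as acc = (acc ^ block) * H in GF(2^128), which is what the glue code's scalar fallback does with crypto_xor() and gf128mul_lle(). A standalone sketch of that loop, with gf128_mul_ghash() as a hypothetical stand-in for the field multiplication:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical GF(2^128) multiply with GHASH bit ordering (the kernel's
 * gf128mul_lle() plays this role); not defined here. */
void gf128_mul_ghash(uint8_t x[16], const uint8_t h[16]);

/* |len| is a nonzero multiple of 16 (GHASH_BLOCK_SIZE). */
static void ghash_blocks(uint8_t acc[16], const uint8_t h[16],
			 const uint8_t *data, size_t len)
{
	size_t i;

	while (len) {
		for (i = 0; i < 16; i++)
			acc[i] ^= data[i];	/* fold in the next block */
		gf128_mul_ghash(acc, h);	/* multiply by the hash key H */
		data += 16;
		len -= 16;
	}
}
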

View File

@ -0,0 +1,137 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SHA-256 and SHA-224 using the RISC-V vector crypto extensions
*
* Copyright (C) 2022 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha256_base.h>
#include <linux/linkage.h>
#include <linux/module.h>
/*
* Note: the asm function only uses the 'state' field of struct sha256_state.
* It is assumed to be the first field.
*/
asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
struct sha256_state *state, const u8 *data, int num_blocks);
static int riscv64_sha256_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
/*
* Ensure struct sha256_state begins directly with the SHA-256
* 256-bit internal state, as this is what the asm function expects.
*/
BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
if (crypto_simd_usable()) {
kernel_vector_begin();
sha256_base_do_update(desc, data, len,
sha256_transform_zvknha_or_zvknhb_zvkb);
kernel_vector_end();
} else {
crypto_sha256_update(desc, data, len);
}
return 0;
}
static int riscv64_sha256_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (crypto_simd_usable()) {
kernel_vector_begin();
if (len)
sha256_base_do_update(
desc, data, len,
sha256_transform_zvknha_or_zvknhb_zvkb);
sha256_base_do_finalize(
desc, sha256_transform_zvknha_or_zvknhb_zvkb);
kernel_vector_end();
return sha256_base_finish(desc, out);
}
return crypto_sha256_finup(desc, data, len, out);
}
static int riscv64_sha256_final(struct shash_desc *desc, u8 *out)
{
return riscv64_sha256_finup(desc, NULL, 0, out);
}
static int riscv64_sha256_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha256_base_init(desc) ?:
riscv64_sha256_finup(desc, data, len, out);
}
static struct shash_alg riscv64_sha256_algs[] = {
{
.init = sha256_base_init,
.update = riscv64_sha256_update,
.final = riscv64_sha256_final,
.finup = riscv64_sha256_finup,
.digest = riscv64_sha256_digest,
.descsize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_priority = 300,
.cra_name = "sha256",
.cra_driver_name = "sha256-riscv64-zvknha_or_zvknhb-zvkb",
.cra_module = THIS_MODULE,
},
}, {
.init = sha224_base_init,
.update = riscv64_sha256_update,
.final = riscv64_sha256_final,
.finup = riscv64_sha256_finup,
.descsize = sizeof(struct sha256_state),
.digestsize = SHA224_DIGEST_SIZE,
.base = {
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_priority = 300,
.cra_name = "sha224",
.cra_driver_name = "sha224-riscv64-zvknha_or_zvknhb-zvkb",
.cra_module = THIS_MODULE,
},
},
};
static int __init riscv64_sha256_mod_init(void)
{
/* Both zvknha and zvknhb provide the SHA-256 instructions. */
if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
riscv_isa_extension_available(NULL, ZVKNHB)) &&
riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
return crypto_register_shashes(riscv64_sha256_algs,
ARRAY_SIZE(riscv64_sha256_algs));
return -ENODEV;
}
static void __exit riscv64_sha256_mod_exit(void)
{
crypto_unregister_shashes(riscv64_sha256_algs,
ARRAY_SIZE(riscv64_sha256_algs));
}
module_init(riscv64_sha256_mod_init);
module_exit(riscv64_sha256_mod_exit);
MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("sha256");
MODULE_ALIAS_CRYPTO("sha224");

View File

@ -0,0 +1,225 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/cfi_types.h>
.text
.option arch, +zvknha, +zvkb
#define STATEP a0
#define DATA a1
#define NUM_BLOCKS a2
#define STATEP_C a3
#define MASK v0
#define INDICES v1
#define W0 v2
#define W1 v3
#define W2 v4
#define W3 v5
#define VTMP v6
#define FEBA v7
#define HGDC v8
#define K0 v10
#define K1 v11
#define K2 v12
#define K3 v13
#define K4 v14
#define K5 v15
#define K6 v16
#define K7 v17
#define K8 v18
#define K9 v19
#define K10 v20
#define K11 v21
#define K12 v22
#define K13 v23
#define K14 v24
#define K15 v25
#define PREV_FEBA v26
#define PREV_HGDC v27
// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
// message schedule words; this macro computes the group after w3 and writes it
// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
// w0), so the caller must cycle through the registers accordingly.
.macro sha256_4rounds last, k, w0, w1, w2, w3
vadd.vv VTMP, \k, \w0
vsha2cl.vv HGDC, FEBA, VTMP
vsha2ch.vv FEBA, HGDC, VTMP
.if !\last
vmerge.vvm VTMP, \w2, \w1, MASK
vsha2ms.vv \w0, VTMP, \w3
.endif
.endm
.macro sha256_16rounds last, k0, k1, k2, k3
sha256_4rounds \last, \k0, W0, W1, W2, W3
sha256_4rounds \last, \k1, W1, W2, W3, W0
sha256_4rounds \last, \k2, W2, W3, W0, W1
sha256_4rounds \last, \k3, W3, W0, W1, W2
.endm
// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[8], const u8 *data,
// int num_blocks);
SYM_TYPED_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
// Load the round constants into K0-K15.
vsetivli zero, 4, e32, m1, ta, ma
la t0, K256
vle32.v K0, (t0)
addi t0, t0, 16
vle32.v K1, (t0)
addi t0, t0, 16
vle32.v K2, (t0)
addi t0, t0, 16
vle32.v K3, (t0)
addi t0, t0, 16
vle32.v K4, (t0)
addi t0, t0, 16
vle32.v K5, (t0)
addi t0, t0, 16
vle32.v K6, (t0)
addi t0, t0, 16
vle32.v K7, (t0)
addi t0, t0, 16
vle32.v K8, (t0)
addi t0, t0, 16
vle32.v K9, (t0)
addi t0, t0, 16
vle32.v K10, (t0)
addi t0, t0, 16
vle32.v K11, (t0)
addi t0, t0, 16
vle32.v K12, (t0)
addi t0, t0, 16
vle32.v K13, (t0)
addi t0, t0, 16
vle32.v K14, (t0)
addi t0, t0, 16
vle32.v K15, (t0)
// Setup mask for the vmerge to replace the first word (idx==0) in
// message scheduling. There are 4 words, so an 8-bit mask suffices.
vsetivli zero, 1, e8, m1, ta, ma
vmv.v.i MASK, 0x01
// Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
// need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype
// is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0},
// loaded using the 32-bit little endian value 0x00041014.
li t0, 0x00041014
vsetivli zero, 1, e32, m1, ta, ma
vmv.v.x INDICES, t0
addi STATEP_C, STATEP, 8
vsetivli zero, 4, e32, m1, ta, ma
vluxei8.v FEBA, (STATEP), INDICES
vluxei8.v HGDC, (STATEP_C), INDICES
.Lnext_block:
addi NUM_BLOCKS, NUM_BLOCKS, -1
// Save the previous state, as it's needed later.
vmv.v.v PREV_FEBA, FEBA
vmv.v.v PREV_HGDC, HGDC
// Load the next 512-bit message block and endian-swap each 32-bit word.
vle32.v W0, (DATA)
vrev8.v W0, W0
addi DATA, DATA, 16
vle32.v W1, (DATA)
vrev8.v W1, W1
addi DATA, DATA, 16
vle32.v W2, (DATA)
vrev8.v W2, W2
addi DATA, DATA, 16
vle32.v W3, (DATA)
vrev8.v W3, W3
addi DATA, DATA, 16
// Do the 64 rounds of SHA-256.
sha256_16rounds 0, K0, K1, K2, K3
sha256_16rounds 0, K4, K5, K6, K7
sha256_16rounds 0, K8, K9, K10, K11
sha256_16rounds 1, K12, K13, K14, K15
// Add the previous state.
vadd.vv FEBA, FEBA, PREV_FEBA
vadd.vv HGDC, HGDC, PREV_HGDC
// Repeat if more blocks remain.
bnez NUM_BLOCKS, .Lnext_block
// Store the new state and return.
vsuxei8.v FEBA, (STATEP), INDICES
vsuxei8.v HGDC, (STATEP_C), INDICES
ret
SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
.section ".rodata"
.p2align 2
.type K256, @object
K256:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size K256, . - K256
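
The index constant used above is worth decoding once: 0x00041014 read as four little-endian bytes gives byte offsets {20, 16, 4, 0}, i.e. 32-bit state words {5, 4, 1, 0} = {f, e, b, a}; the same indices applied at STATEP + 8 select words {7, 6, 3, 2} = {h, g, d, c}. A small standalone C check, for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const char *names = "abcdefgh";
	uint32_t indices = 0x00041014;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned off = (indices >> (8 * i)) & 0xff;	/* 20, 16, 4, 0 */

		printf("element %d: word %u (%c), +8 bytes: word %u (%c)\n",
		       i, off / 4, names[off / 4],
		       2 + off / 4, names[2 + off / 4]);
	}
	return 0;	/* prints f/h, e/g, b/d, a/c: the FEBA and HGDC layout */
}
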

View File

@ -0,0 +1,133 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SHA-512 and SHA-384 using the RISC-V vector crypto extensions
*
* Copyright (C) 2023 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha512_base.h>
#include <linux/linkage.h>
#include <linux/module.h>
/*
* Note: the asm function only uses the 'state' field of struct sha512_state.
* It is assumed to be the first field.
*/
asmlinkage void sha512_transform_zvknhb_zvkb(
struct sha512_state *state, const u8 *data, int num_blocks);
static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
/*
* Ensure struct sha512_state begins directly with the SHA-512
* 512-bit internal state, as this is what the asm function expects.
*/
BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
if (crypto_simd_usable()) {
kernel_vector_begin();
sha512_base_do_update(desc, data, len,
sha512_transform_zvknhb_zvkb);
kernel_vector_end();
} else {
crypto_sha512_update(desc, data, len);
}
return 0;
}
static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (crypto_simd_usable()) {
kernel_vector_begin();
if (len)
sha512_base_do_update(desc, data, len,
sha512_transform_zvknhb_zvkb);
sha512_base_do_finalize(desc, sha512_transform_zvknhb_zvkb);
kernel_vector_end();
return sha512_base_finish(desc, out);
}
return crypto_sha512_finup(desc, data, len, out);
}
static int riscv64_sha512_final(struct shash_desc *desc, u8 *out)
{
return riscv64_sha512_finup(desc, NULL, 0, out);
}
static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha512_base_init(desc) ?:
riscv64_sha512_finup(desc, data, len, out);
}
static struct shash_alg riscv64_sha512_algs[] = {
{
.init = sha512_base_init,
.update = riscv64_sha512_update,
.final = riscv64_sha512_final,
.finup = riscv64_sha512_finup,
.digest = riscv64_sha512_digest,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA512_DIGEST_SIZE,
.base = {
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_priority = 300,
.cra_name = "sha512",
.cra_driver_name = "sha512-riscv64-zvknhb-zvkb",
.cra_module = THIS_MODULE,
},
}, {
.init = sha384_base_init,
.update = riscv64_sha512_update,
.final = riscv64_sha512_final,
.finup = riscv64_sha512_finup,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA384_DIGEST_SIZE,
.base = {
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_priority = 300,
.cra_name = "sha384",
.cra_driver_name = "sha384-riscv64-zvknhb-zvkb",
.cra_module = THIS_MODULE,
},
},
};
static int __init riscv64_sha512_mod_init(void)
{
if (riscv_isa_extension_available(NULL, ZVKNHB) &&
riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
return crypto_register_shashes(riscv64_sha512_algs,
ARRAY_SIZE(riscv64_sha512_algs));
return -ENODEV;
}
static void __exit riscv64_sha512_mod_exit(void)
{
crypto_unregister_shashes(riscv64_sha512_algs,
ARRAY_SIZE(riscv64_sha512_algs));
}
module_init(riscv64_sha512_mod_init);
module_exit(riscv64_sha512_mod_exit);
MODULE_DESCRIPTION("SHA-512 (RISC-V accelerated)");
MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("sha512");
MODULE_ALIAS_CRYPTO("sha384");

View File

@ -0,0 +1,203 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/cfi_types.h>
.text
.option arch, +zvknhb, +zvkb
#define STATEP a0
#define DATA a1
#define NUM_BLOCKS a2
#define STATEP_C a3
#define K a4
#define MASK v0
#define INDICES v1
#define W0 v10 // LMUL=2
#define W1 v12 // LMUL=2
#define W2 v14 // LMUL=2
#define W3 v16 // LMUL=2
#define VTMP v20 // LMUL=2
#define FEBA v22 // LMUL=2
#define HGDC v24 // LMUL=2
#define PREV_FEBA v26 // LMUL=2
#define PREV_HGDC v28 // LMUL=2
// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
// message schedule words; this macro computes the group after w3 and writes it
// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
// w0), so the caller must cycle through the registers accordingly.
.macro sha512_4rounds last, w0, w1, w2, w3
vle64.v VTMP, (K)
addi K, K, 32
vadd.vv VTMP, VTMP, \w0
vsha2cl.vv HGDC, FEBA, VTMP
vsha2ch.vv FEBA, HGDC, VTMP
.if !\last
vmerge.vvm VTMP, \w2, \w1, MASK
vsha2ms.vv \w0, VTMP, \w3
.endif
.endm
.macro sha512_16rounds last
sha512_4rounds \last, W0, W1, W2, W3
sha512_4rounds \last, W1, W2, W3, W0
sha512_4rounds \last, W2, W3, W0, W1
sha512_4rounds \last, W3, W0, W1, W2
.endm
// void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data,
// int num_blocks);
SYM_TYPED_FUNC_START(sha512_transform_zvknhb_zvkb)
// Setup mask for the vmerge to replace the first word (idx==0) in
// message scheduling. There are 4 words, so an 8-bit mask suffices.
vsetivli zero, 1, e8, m1, ta, ma
vmv.v.i MASK, 0x01
// Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
// need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype
// is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0},
// loaded using the 32-bit little endian value 0x00082028.
li t0, 0x00082028
vsetivli zero, 1, e32, m1, ta, ma
vmv.v.x INDICES, t0
addi STATEP_C, STATEP, 16
vsetivli zero, 4, e64, m2, ta, ma
vluxei8.v FEBA, (STATEP), INDICES
vluxei8.v HGDC, (STATEP_C), INDICES
.Lnext_block:
la K, K512
addi NUM_BLOCKS, NUM_BLOCKS, -1
// Save the previous state, as it's needed later.
vmv.v.v PREV_FEBA, FEBA
vmv.v.v PREV_HGDC, HGDC
// Load the next 1024-bit message block and endian-swap each 64-bit word
vle64.v W0, (DATA)
vrev8.v W0, W0
addi DATA, DATA, 32
vle64.v W1, (DATA)
vrev8.v W1, W1
addi DATA, DATA, 32
vle64.v W2, (DATA)
vrev8.v W2, W2
addi DATA, DATA, 32
vle64.v W3, (DATA)
vrev8.v W3, W3
addi DATA, DATA, 32
// Do the 80 rounds of SHA-512.
sha512_16rounds 0
sha512_16rounds 0
sha512_16rounds 0
sha512_16rounds 0
sha512_16rounds 1
// Add the previous state.
vadd.vv FEBA, FEBA, PREV_FEBA
vadd.vv HGDC, HGDC, PREV_HGDC
// Repeat if more blocks remain.
bnez NUM_BLOCKS, .Lnext_block
// Store the new state and return.
vsuxei8.v FEBA, (STATEP), INDICES
vsuxei8.v HGDC, (STATEP_C), INDICES
ret
SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
.section ".rodata"
.p2align 3
.type K512, @object
K512:
.dword 0x428a2f98d728ae22, 0x7137449123ef65cd
.dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.dword 0x3956c25bf348b538, 0x59f111f1b605d019
.dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.dword 0xd807aa98a3030242, 0x12835b0145706fbe
.dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.dword 0x9bdc06a725c71235, 0xc19bf174cf692694
.dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.dword 0x983e5152ee66dfab, 0xa831c66d2db43210
.dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.dword 0x06ca6351e003826f, 0x142929670a0e6e70
.dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
.dword 0x81c2c92e47edaee6, 0x92722c851482353b
.dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
.dword 0xd192e819d6ef5218, 0xd69906245565a910
.dword 0xf40e35855771202a, 0x106aa07032bbd1b8
.dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
.dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.dword 0x90befffa23631e28, 0xa4506cebde82bde9
.dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.dword 0xca273eceea26619c, 0xd186b8c721c0c207
.dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.dword 0x113f9804bef90dae, 0x1b710b35131c471b
.dword 0x28db77f523047d84, 0x32caab7b40c72493
.dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.size K512, . - K512
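
Same indexed-load trick as in the SHA-256 code, but with 64-bit state words: 0x00082028 decodes to byte offsets {40, 32, 8, 0}, i.e. words {5, 4, 1, 0} = {f, e, b, a}, and the same indices at STATEP + 16 give {h, g, d, c}. A short check, for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t indices = 0x00082028;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned off = (indices >> (8 * i)) & 0xff;

		printf("byte offset %2u -> 64-bit word %u\n", off, off / 8);
	}
	return 0;	/* prints 40->5, 32->4, 8->1, 0->0 */
}
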

View File

@ -0,0 +1,112 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SM3 using the RISC-V vector crypto extensions
*
* Copyright (C) 2023 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sm3_base.h>
#include <linux/linkage.h>
#include <linux/module.h>
/*
* Note: the asm function only uses the 'state' field of struct sm3_state.
* It is assumed to be the first field.
*/
asmlinkage void sm3_transform_zvksh_zvkb(
struct sm3_state *state, const u8 *data, int num_blocks);
static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
/*
* Ensure struct sm3_state begins directly with the SM3
* 256-bit internal state, as this is what the asm function expects.
*/
BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
if (crypto_simd_usable()) {
kernel_vector_begin();
sm3_base_do_update(desc, data, len, sm3_transform_zvksh_zvkb);
kernel_vector_end();
} else {
sm3_update(shash_desc_ctx(desc), data, len);
}
return 0;
}
static int riscv64_sm3_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct sm3_state *ctx;
if (crypto_simd_usable()) {
kernel_vector_begin();
if (len)
sm3_base_do_update(desc, data, len,
sm3_transform_zvksh_zvkb);
sm3_base_do_finalize(desc, sm3_transform_zvksh_zvkb);
kernel_vector_end();
return sm3_base_finish(desc, out);
}
ctx = shash_desc_ctx(desc);
if (len)
sm3_update(ctx, data, len);
sm3_final(ctx, out);
return 0;
}
static int riscv64_sm3_final(struct shash_desc *desc, u8 *out)
{
return riscv64_sm3_finup(desc, NULL, 0, out);
}
static struct shash_alg riscv64_sm3_alg = {
.init = sm3_base_init,
.update = riscv64_sm3_update,
.final = riscv64_sm3_final,
.finup = riscv64_sm3_finup,
.descsize = sizeof(struct sm3_state),
.digestsize = SM3_DIGEST_SIZE,
.base = {
.cra_blocksize = SM3_BLOCK_SIZE,
.cra_priority = 300,
.cra_name = "sm3",
.cra_driver_name = "sm3-riscv64-zvksh-zvkb",
.cra_module = THIS_MODULE,
},
};
static int __init riscv64_sm3_mod_init(void)
{
if (riscv_isa_extension_available(NULL, ZVKSH) &&
riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
return crypto_register_shash(&riscv64_sm3_alg);
return -ENODEV;
}
static void __exit riscv64_sm3_mod_exit(void)
{
crypto_unregister_shash(&riscv64_sm3_alg);
}
module_init(riscv64_sm3_mod_init);
module_exit(riscv64_sm3_mod_exit);
MODULE_DESCRIPTION("SM3 (RISC-V accelerated)");
MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("sm3");

View File

@ -0,0 +1,123 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector SM3 Secure Hash extension ('Zvksh')
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/cfi_types.h>
.text
.option arch, +zvksh, +zvkb
#define STATEP a0
#define DATA a1
#define NUM_BLOCKS a2
#define STATE v0 // LMUL=2
#define PREV_STATE v2 // LMUL=2
#define W0 v4 // LMUL=2
#define W1 v6 // LMUL=2
#define VTMP v8 // LMUL=2
.macro sm3_8rounds i, w0, w1
// Do 4 rounds using W_{0+i}..W_{7+i}.
vsm3c.vi STATE, \w0, \i + 0
vslidedown.vi VTMP, \w0, 2
vsm3c.vi STATE, VTMP, \i + 1
// Compute W_{4+i}..W_{11+i}.
vslidedown.vi VTMP, \w0, 4
vslideup.vi VTMP, \w1, 4
// Do 4 rounds using W_{4+i}..W_{11+i}.
vsm3c.vi STATE, VTMP, \i + 2
vslidedown.vi VTMP, VTMP, 2
vsm3c.vi STATE, VTMP, \i + 3
.if \i < 28
// Compute W_{16+i}..W_{23+i}.
vsm3me.vv \w0, \w1, \w0
.endif
// For the next 8 rounds, w0 and w1 are swapped.
.endm
// void sm3_transform_zvksh_zvkb(u32 state[8], const u8 *data, int num_blocks);
SYM_TYPED_FUNC_START(sm3_transform_zvksh_zvkb)
// Load the state and endian-swap each 32-bit word.
vsetivli zero, 8, e32, m2, ta, ma
vle32.v STATE, (STATEP)
vrev8.v STATE, STATE
.Lnext_block:
addi NUM_BLOCKS, NUM_BLOCKS, -1
// Save the previous state, as it's needed later.
vmv.v.v PREV_STATE, STATE
// Load the next 512-bit message block into W0-W1.
vle32.v W0, (DATA)
addi DATA, DATA, 32
vle32.v W1, (DATA)
addi DATA, DATA, 32
// Do the 64 rounds of SM3.
sm3_8rounds 0, W0, W1
sm3_8rounds 4, W1, W0
sm3_8rounds 8, W0, W1
sm3_8rounds 12, W1, W0
sm3_8rounds 16, W0, W1
sm3_8rounds 20, W1, W0
sm3_8rounds 24, W0, W1
sm3_8rounds 28, W1, W0
// XOR in the previous state.
vxor.vv STATE, STATE, PREV_STATE
// Repeat if more blocks remain.
bnez NUM_BLOCKS, .Lnext_block
// Store the new state and return.
vrev8.v STATE, STATE
vse32.v STATE, (STATEP)
ret
SYM_FUNC_END(sm3_transform_zvksh_zvkb)

View File

@ -0,0 +1,107 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* SM4 using the RISC-V vector crypto extensions
*
* Copyright (C) 2023 VRULL GmbH
* Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
*
* Copyright (C) 2023 SiFive, Inc.
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
#include <crypto/sm4.h>
#include <linux/linkage.h>
#include <linux/module.h>
asmlinkage void sm4_expandkey_zvksed_zvkb(const u8 user_key[SM4_KEY_SIZE],
u32 rkey_enc[SM4_RKEY_WORDS],
u32 rkey_dec[SM4_RKEY_WORDS]);
asmlinkage void sm4_crypt_zvksed_zvkb(const u32 rkey[SM4_RKEY_WORDS],
const u8 in[SM4_BLOCK_SIZE],
u8 out[SM4_BLOCK_SIZE]);
static int riscv64_sm4_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (crypto_simd_usable()) {
if (keylen != SM4_KEY_SIZE)
return -EINVAL;
kernel_vector_begin();
sm4_expandkey_zvksed_zvkb(key, ctx->rkey_enc, ctx->rkey_dec);
kernel_vector_end();
return 0;
}
return sm4_expandkey(ctx, key, keylen);
}
static void riscv64_sm4_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (crypto_simd_usable()) {
kernel_vector_begin();
sm4_crypt_zvksed_zvkb(ctx->rkey_enc, src, dst);
kernel_vector_end();
} else {
sm4_crypt_block(ctx->rkey_enc, dst, src);
}
}
static void riscv64_sm4_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (crypto_simd_usable()) {
kernel_vector_begin();
sm4_crypt_zvksed_zvkb(ctx->rkey_dec, src, dst);
kernel_vector_end();
} else {
sm4_crypt_block(ctx->rkey_dec, dst, src);
}
}
static struct crypto_alg riscv64_sm4_alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_priority = 300,
.cra_name = "sm4",
.cra_driver_name = "sm4-riscv64-zvksed-zvkb",
.cra_cipher = {
.cia_min_keysize = SM4_KEY_SIZE,
.cia_max_keysize = SM4_KEY_SIZE,
.cia_setkey = riscv64_sm4_setkey,
.cia_encrypt = riscv64_sm4_encrypt,
.cia_decrypt = riscv64_sm4_decrypt,
},
.cra_module = THIS_MODULE,
};
static int __init riscv64_sm4_mod_init(void)
{
if (riscv_isa_extension_available(NULL, ZVKSED) &&
riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
return crypto_register_alg(&riscv64_sm4_alg);
return -ENODEV;
}
static void __exit riscv64_sm4_mod_exit(void)
{
crypto_unregister_alg(&riscv64_sm4_alg);
}
module_init(riscv64_sm4_mod_init);
module_exit(riscv64_sm4_mod_exit);
MODULE_DESCRIPTION("SM4 (RISC-V accelerated)");
MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("sm4");

View File

@ -0,0 +1,117 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector SM4 Block Cipher extension ('Zvksed')
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
#include <linux/linkage.h>
.text
.option arch, +zvksed, +zvkb
// void sm4_expandkey_zksed_zvkb(const u8 user_key[16], u32 rkey_enc[32],
// u32 rkey_dec[32]);
SYM_FUNC_START(sm4_expandkey_zvksed_zvkb)
vsetivli zero, 4, e32, m1, ta, ma
// Load the user key.
vle32.v v1, (a0)
vrev8.v v1, v1
// XOR the user key with the family key.
la t0, FAMILY_KEY
vle32.v v2, (t0)
vxor.vv v1, v1, v2
// Compute the round keys. Store them in forwards order in rkey_enc
// and in reverse order in rkey_dec.
addi a2, a2, 31*4
li t0, -4
.set i, 0
.rept 8
vsm4k.vi v1, v1, i
vse32.v v1, (a1) // Store to rkey_enc.
vsse32.v v1, (a2), t0 // Store to rkey_dec.
.if i < 7
addi a1, a1, 16
addi a2, a2, -16
.endif
.set i, i + 1
.endr
ret
SYM_FUNC_END(sm4_expandkey_zvksed_zvkb)
// void sm4_crypt_zvksed_zvkb(const u32 rkey[32], const u8 in[16], u8 out[16]);
SYM_FUNC_START(sm4_crypt_zvksed_zvkb)
vsetivli zero, 4, e32, m1, ta, ma
// Load the input data.
vle32.v v1, (a1)
vrev8.v v1, v1
// Do the 32 rounds of SM4, 4 at a time.
.set i, 0
.rept 8
vle32.v v2, (a0)
vsm4r.vs v1, v2
.if i < 7
addi a0, a0, 16
.endif
.set i, i + 1
.endr
// Store the output data (in reverse element order).
vrev8.v v1, v1
li t0, -4
addi a2, a2, 12
vsse32.v v1, (a2), t0
ret
SYM_FUNC_END(sm4_crypt_zvksed_zvkb)
.section ".rodata"
.p2align 2
.type FAMILY_KEY, @object
FAMILY_KEY:
.word 0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC
.size FAMILY_KEY, . - FAMILY_KEY

View File

@ -18,9 +18,9 @@
#include <asm/sbi.h>
#include <asm/vendorid_list.h>
#define ANDESTECH_AX45MP_MARCHID 0x8000000000008a45UL
#define ANDESTECH_AX45MP_MIMPID 0x500UL
#define ANDESTECH_SBI_EXT_ANDES 0x0900031E
#define ANDES_AX45MP_MARCHID 0x8000000000008a45UL
#define ANDES_AX45MP_MIMPID 0x500UL
#define ANDES_SBI_EXT_ANDES 0x0900031E
#define ANDES_SBI_EXT_IOCP_SW_WORKAROUND 1
@ -32,7 +32,7 @@ static long ax45mp_iocp_sw_workaround(void)
* The ANDES_SBI_EXT_IOCP_SW_WORKAROUND SBI extension checks whether the IOCP is
* missing and the cache is controllable; only then is CMO applied to the platform.
*/
ret = sbi_ecall(ANDESTECH_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND,
ret = sbi_ecall(ANDES_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND,
0, 0, 0, 0, 0, 0);
return ret.error ? 0 : ret.value;
@ -50,7 +50,7 @@ static void errata_probe_iocp(unsigned int stage, unsigned long arch_id, unsigne
done = true;
if (arch_id != ANDESTECH_AX45MP_MARCHID || impid != ANDESTECH_AX45MP_MIMPID)
if (arch_id != ANDES_AX45MP_MARCHID || impid != ANDES_AX45MP_MIMPID)
return;
if (!ax45mp_iocp_sw_workaround())

View File

@ -183,6 +183,16 @@
REG_L x31, PT_T6(sp)
.endm
/* Annotate a function as being unsuitable for kprobes. */
#ifdef CONFIG_KPROBES
#define ASM_NOKPROBE(name) \
.pushsection "_kprobe_blacklist", "aw"; \
RISCV_PTR name; \
.popsection
#else
#define ASM_NOKPROBE(name)
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_RISCV_ASM_H */

View File

@ -17,7 +17,6 @@
#endif
#include <asm/cmpxchg.h>
#include <asm/barrier.h>
#define __atomic_acquire_fence() \
__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
@ -207,7 +206,7 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int
" add %[rc], %[p], %[a]\n"
" sc.w.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
: [a]"r" (a), [u]"r" (u)
@ -228,7 +227,7 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a,
" add %[rc], %[p], %[a]\n"
" sc.d.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
: [a]"r" (a), [u]"r" (u)
@ -248,7 +247,7 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v)
" addi %[rc], %[p], 1\n"
" sc.w.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:
@ -268,7 +267,7 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v)
" addi %[rc], %[p], -1\n"
" sc.w.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:
@ -288,7 +287,7 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v)
" bltz %[rc], 1f\n"
" sc.w.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:
@ -310,7 +309,7 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v)
" addi %[rc], %[p], 1\n"
" sc.d.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:
@ -331,7 +330,7 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v)
" addi %[rc], %[p], -1\n"
" sc.d.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:
@ -352,7 +351,7 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
" bltz %[rc], 1f\n"
" sc.d.rl %[rc], %[rc], %[c]\n"
" bnez %[rc], 0b\n"
" fence rw, rw\n"
RISCV_FULL_BARRIER
"1:\n"
: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
:

View File

@ -11,28 +11,27 @@
#define _ASM_RISCV_BARRIER_H
#ifndef __ASSEMBLY__
#include <asm/fence.h>
#define nop() __asm__ __volatile__ ("nop")
#define __nops(n) ".rept " #n "\nnop\n.endr\n"
#define nops(n) __asm__ __volatile__ (__nops(n))
#define RISCV_FENCE(p, s) \
__asm__ __volatile__ ("fence " #p "," #s : : : "memory")
/* These barriers need to enforce ordering on both devices or memory. */
#define mb() RISCV_FENCE(iorw,iorw)
#define rmb() RISCV_FENCE(ir,ir)
#define wmb() RISCV_FENCE(ow,ow)
#define __mb() RISCV_FENCE(iorw, iorw)
#define __rmb() RISCV_FENCE(ir, ir)
#define __wmb() RISCV_FENCE(ow, ow)
/* These barriers do not need to enforce ordering on devices, just memory. */
#define __smp_mb() RISCV_FENCE(rw,rw)
#define __smp_rmb() RISCV_FENCE(r,r)
#define __smp_wmb() RISCV_FENCE(w,w)
#define __smp_mb() RISCV_FENCE(rw, rw)
#define __smp_rmb() RISCV_FENCE(r, r)
#define __smp_wmb() RISCV_FENCE(w, w)
#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
RISCV_FENCE(rw,w); \
RISCV_FENCE(rw, w); \
WRITE_ONCE(*p, v); \
} while (0)
@ -40,7 +39,7 @@ do { \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
RISCV_FENCE(r,rw); \
RISCV_FENCE(r, rw); \
___p1; \
})
@ -69,7 +68,7 @@ do { \
* instances the scheduler pairs this with an mb(), so nothing is necessary on
* the new hart.
*/
#define smp_mb__after_spinlock() RISCV_FENCE(iorw,iorw)
#define smp_mb__after_spinlock() RISCV_FENCE(iorw, iorw)
#include <asm-generic/barrier.h>

View File

@ -22,6 +22,16 @@
#include <asm-generic/bitops/fls.h>
#else
#define __HAVE_ARCH___FFS
#define __HAVE_ARCH___FLS
#define __HAVE_ARCH_FFS
#define __HAVE_ARCH_FLS
#include <asm-generic/bitops/__ffs.h>
#include <asm-generic/bitops/__fls.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/fls.h>
#include <asm/alternative-macros.h>
#include <asm/hwcap.h>
@ -37,8 +47,6 @@
static __always_inline unsigned long variable__ffs(unsigned long word)
{
int num;
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
: : : : legacy);
@ -52,32 +60,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
return word;
legacy:
num = 0;
#if BITS_PER_LONG == 64
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
#endif
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
return generic___ffs(word);
}
/**
@ -93,8 +76,6 @@ legacy:
static __always_inline unsigned long variable__fls(unsigned long word)
{
int num;
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
: : : : legacy);
@ -108,32 +89,7 @@ static __always_inline unsigned long variable__fls(unsigned long word)
return BITS_PER_LONG - 1 - word;
legacy:
num = BITS_PER_LONG - 1;
#if BITS_PER_LONG == 64
if (!(word & (~0ul << 32))) {
num -= 32;
word <<= 32;
}
#endif
if (!(word & (~0ul << (BITS_PER_LONG - 16)))) {
num -= 16;
word <<= 16;
}
if (!(word & (~0ul << (BITS_PER_LONG - 8)))) {
num -= 8;
word <<= 8;
}
if (!(word & (~0ul << (BITS_PER_LONG - 4)))) {
num -= 4;
word <<= 4;
}
if (!(word & (~0ul << (BITS_PER_LONG - 2)))) {
num -= 2;
word <<= 2;
}
if (!(word & (~0ul << (BITS_PER_LONG - 1))))
num -= 1;
return num;
return generic___fls(word);
}
/**
@ -149,46 +105,23 @@ legacy:
static __always_inline int variable_ffs(int x)
{
int r;
if (!x)
return 0;
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
: : : : legacy);
if (!x)
return 0;
asm volatile (".option push\n"
".option arch,+zbb\n"
CTZW "%0, %1\n"
".option pop\n"
: "=r" (r) : "r" (x) :);
: "=r" (x) : "r" (x) :);
return r + 1;
return x + 1;
legacy:
r = 1;
if (!(x & 0xffff)) {
x >>= 16;
r += 16;
}
if (!(x & 0xff)) {
x >>= 8;
r += 8;
}
if (!(x & 0xf)) {
x >>= 4;
r += 4;
}
if (!(x & 3)) {
x >>= 2;
r += 2;
}
if (!(x & 1)) {
x >>= 1;
r += 1;
}
return r;
return generic_ffs(x);
}
/**
@ -204,46 +137,23 @@ legacy:
static __always_inline int variable_fls(unsigned int x)
{
int r;
if (!x)
return 0;
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
: : : : legacy);
if (!x)
return 0;
asm volatile (".option push\n"
".option arch,+zbb\n"
CLZW "%0, %1\n"
".option pop\n"
: "=r" (r) : "r" (x) :);
: "=r" (x) : "r" (x) :);
return 32 - r;
return 32 - x;
legacy:
r = 32;
if (!(x & 0xffff0000u)) {
x <<= 16;
r -= 16;
}
if (!(x & 0xff000000u)) {
x <<= 8;
r -= 8;
}
if (!(x & 0xf0000000u)) {
x <<= 4;
r -= 4;
}
if (!(x & 0xc0000000u)) {
x <<= 2;
r -= 2;
}
if (!(x & 0x80000000u)) {
x <<= 1;
r -= 1;
}
return r;
return generic_fls(x);
}
/**

View File

@ -8,7 +8,6 @@
#include <linux/bug.h>
#include <asm/barrier.h>
#include <asm/fence.h>
#define __xchg_relaxed(ptr, new, size) \
@ -313,7 +312,7 @@
" bne %0, %z3, 1f\n" \
" sc.w.rl %1, %z4, %2\n" \
" bnez %1, 0b\n" \
" fence rw, rw\n" \
RISCV_FULL_BARRIER \
"1:\n" \
: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \
: "rJ" ((long)__old), "rJ" (__new) \
@ -325,7 +324,7 @@
" bne %0, %z3, 1f\n" \
" sc.d.rl %1, %z4, %2\n" \
" bnez %1, 0b\n" \
" fence rw, rw\n" \
RISCV_FULL_BARRIER \
"1:\n" \
: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \
: "rJ" (__old), "rJ" (__new) \

View File

@ -14,9 +14,28 @@
static inline int is_compat_task(void)
{
if (!IS_ENABLED(CONFIG_COMPAT))
return 0;
return test_thread_flag(TIF_32BIT);
}
static inline int is_compat_thread(struct thread_info *thread)
{
if (!IS_ENABLED(CONFIG_COMPAT))
return 0;
return test_ti_thread_flag(thread, TIF_32BIT);
}
static inline void set_compat_task(bool is_compat)
{
if (is_compat)
set_thread_flag(TIF_32BIT);
else
clear_thread_flag(TIF_32BIT);
}
struct compat_user_regs_struct {
compat_ulong_t pc;
compat_ulong_t ra;

View File

@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2022-2023 Rivos, Inc
* Copyright 2022-2024 Rivos, Inc
*/
#ifndef _ASM_CPUFEATURE_H
@ -28,29 +28,38 @@ struct riscv_isainfo {
DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
DECLARE_PER_CPU(long, misaligned_access_speed);
/* Per-cpu ISA extensions. */
extern struct riscv_isainfo hart_isa[NR_CPUS];
void riscv_user_isa_enable(void);
#ifdef CONFIG_RISCV_MISALIGNED
bool unaligned_ctl_available(void);
bool check_unaligned_access_emulated(int cpu);
#if defined(CONFIG_RISCV_MISALIGNED)
bool check_unaligned_access_emulated_all_cpus(void);
void unaligned_emulation_finish(void);
bool unaligned_ctl_available(void);
DECLARE_PER_CPU(long, misaligned_access_speed);
#else
static inline bool unaligned_ctl_available(void)
{
return false;
}
#endif
static inline bool check_unaligned_access_emulated(int cpu)
#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
static __always_inline bool has_fast_unaligned_accesses(void)
{
return false;
return static_branch_likely(&fast_unaligned_access_speed_key);
}
#else
static __always_inline bool has_fast_unaligned_accesses(void)
{
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
return true;
else
return false;
}
static inline void unaligned_emulation_finish(void) {}
#endif
unsigned long riscv_get_elf_hwcap(void);
@ -135,6 +144,4 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi
return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
}
DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
#endif

View File

@ -53,13 +53,9 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
#define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2)
#ifdef CONFIG_64BIT
#ifdef CONFIG_COMPAT
#define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \
#define STACK_RND_MASK (is_compat_task() ? \
0x7ff >> (PAGE_SHIFT - 12) : \
0x3ffff >> (PAGE_SHIFT - 12))
#else
#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12))
#endif
#endif
/*
@ -139,10 +135,7 @@ do { \
#ifdef CONFIG_COMPAT
#define SET_PERSONALITY(ex) \
do { if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
set_thread_flag(TIF_32BIT); \
else \
clear_thread_flag(TIF_32BIT); \
do { set_compat_task((ex).e_ident[EI_CLASS] == ELFCLASS32); \
if (personality(current->personality) != PER_LINUX32) \
set_personality(PER_LINUX | \
(current->personality & (~PER_MASK))); \

View File

@ -12,8 +12,8 @@
#include <asm/vendorid_list.h>
#ifdef CONFIG_ERRATA_ANDES
#define ERRATA_ANDESTECH_NO_IOCP 0
#define ERRATA_ANDESTECH_NUMBER 1
#define ERRATA_ANDES_NO_IOCP 0
#define ERRATA_ANDES_NUMBER 1
#endif
#ifdef CONFIG_ERRATA_SIFIVE
@ -112,15 +112,6 @@ asm volatile(ALTERNATIVE( \
#define THEAD_C9XX_RV_IRQ_PMU 17
#define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5
#define ALT_SBI_PMU_OVERFLOW(__ovl) \
asm volatile(ALTERNATIVE( \
"csrr %0, " __stringify(CSR_SSCOUNTOVF), \
"csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \
THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \
CONFIG_ERRATA_THEAD_PMU) \
: "=r" (__ovl) : \
: "memory")
#endif /* __ASSEMBLY__ */
#endif

View File

@ -1,12 +1,18 @@
#ifndef _ASM_RISCV_FENCE_H
#define _ASM_RISCV_FENCE_H
#define RISCV_FENCE_ASM(p, s) "\tfence " #p "," #s "\n"
#define RISCV_FENCE(p, s) \
({ __asm__ __volatile__ (RISCV_FENCE_ASM(p, s) : : : "memory"); })
#ifdef CONFIG_SMP
#define RISCV_ACQUIRE_BARRIER "\tfence r , rw\n"
#define RISCV_RELEASE_BARRIER "\tfence rw, w\n"
#define RISCV_ACQUIRE_BARRIER RISCV_FENCE_ASM(r, rw)
#define RISCV_RELEASE_BARRIER RISCV_FENCE_ASM(rw, w)
#define RISCV_FULL_BARRIER RISCV_FENCE_ASM(rw, rw)
#else
#define RISCV_ACQUIRE_BARRIER
#define RISCV_RELEASE_BARRIER
#define RISCV_FULL_BARRIER
#endif
#endif /* _ASM_RISCV_FENCE_H */
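/*
 * Worked expansion for reference: RISCV_FENCE(rw, w) becomes
 * ({ __asm__ __volatile__ ("\tfence rw,w\n" : : : "memory"); }), while
 * RISCV_ACQUIRE_BARRIER is just the string "\tfence r,rw\n", so it can be
 * pasted into larger inline-assembly sequences such as
 * __atomic_acquire_fence() in atomic.h above.
 */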

View File

@ -80,6 +80,7 @@
#define RISCV_ISA_EXT_ZFA 71
#define RISCV_ISA_EXT_ZTSO 72
#define RISCV_ISA_EXT_ZACAS 73
#define RISCV_ISA_EXT_XANDESPMU 74
#define RISCV_ISA_EXT_XLINUXENVCFG 127

View File

@ -47,10 +47,10 @@
* sufficient to ensure this works sanely on controllers that support I/O
* writes.
*/
#define __io_pbr() __asm__ __volatile__ ("fence io,i" : : : "memory");
#define __io_par(v) __asm__ __volatile__ ("fence i,ior" : : : "memory");
#define __io_pbw() __asm__ __volatile__ ("fence iow,o" : : : "memory");
#define __io_paw() __asm__ __volatile__ ("fence o,io" : : : "memory");
#define __io_pbr() RISCV_FENCE(io, i)
#define __io_par(v) RISCV_FENCE(i, ior)
#define __io_pbw() RISCV_FENCE(iow, o)
#define __io_paw() RISCV_FENCE(o, io)
/*
* Accesses from a single hart to a single I/O address must be ordered. This

View File

@ -0,0 +1,50 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_RISCV_MEMBARRIER_H
#define _ASM_RISCV_MEMBARRIER_H
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
struct mm_struct *next,
struct task_struct *tsk)
{
/*
* Only need the full barrier when switching between processes.
* Barrier when switching from kernel to userspace is not
* required here, given that it is implied by mmdrop(). Barrier
* when switching from userspace to kernel is not needed after
* store to rq->curr.
*/
if (IS_ENABLED(CONFIG_SMP) &&
likely(!(atomic_read(&next->membarrier_state) &
(MEMBARRIER_STATE_PRIVATE_EXPEDITED |
MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
return;
/*
* The membarrier system call requires a full memory barrier
* after storing to rq->curr, before going back to user-space.
*
* This barrier is also needed for the SYNC_CORE command when
* switching between processes; in particular, on a transition
* from a thread belonging to another mm to a thread belonging
* to the mm for which a membarrier SYNC_CORE is done on CPU0:
*
* - [CPU0] sets all bits in the mm icache_stale_mask (in
* prepare_sync_core_cmd());
*
* - [CPU1] stores to rq->curr (by the scheduler);
*
* - [CPU0] loads rq->curr within membarrier and observes
* cpu_rq(1)->curr->mm != mm, so the IPI is skipped on
* CPU1; this means membarrier relies on switch_mm() to
* issue the sync-core;
*
* - [CPU1] switch_mm() loads icache_stale_mask; if the bit
* is zero, switch_mm() may incorrectly skip the sync-core.
*
* Matches a full barrier in the proximity of the membarrier
* system call entry.
*/
smp_mb();
}
#endif /* _ASM_RISCV_MEMBARRIER_H */
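For context, a minimal user-space sketch of the sync-core scenario the comment above describes (a hypothetical JIT that patches code at runtime; the command names are those of membarrier(2) from <linux/membarrier.h>): the process registers once for private expedited sync-core membarriers, then issues one after rewriting code so every thread executes a core-serializing instruction (FENCE.I on RISC-V) before returning to user code.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

/* Once, at process start-up: opt in to SYNC_CORE membarriers. */
void jit_init(void)
{
	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
}

/* After patching code: every thread of the process runs FENCE.I before
 * its next return to user-space, so the new instructions are visible. */
void jit_publish(void)
{
	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
}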

View File

@ -12,6 +12,7 @@
#define _ASM_RISCV_MMIO_H
#include <linux/types.h>
#include <asm/fence.h>
#include <asm/mmiowb.h>
/* Generic IO read/write. These perform native-endian accesses. */
@ -131,8 +132,8 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
* doesn't define any ordering between the memory space and the I/O space.
*/
#define __io_br() do {} while (0)
#define __io_ar(v) ({ __asm__ __volatile__ ("fence i,ir" : : : "memory"); })
#define __io_bw() ({ __asm__ __volatile__ ("fence w,o" : : : "memory"); })
#define __io_ar(v) RISCV_FENCE(i, ir)
#define __io_bw() RISCV_FENCE(w, o)
#define __io_aw() mmiowb_set_pending()
#define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; })

View File

@ -7,7 +7,7 @@
* "o,w" is sufficient to ensure that all writes to the device have completed
* before the write to the spinlock is allowed to commit.
*/
#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory");
#define mmiowb() RISCV_FENCE(o, w)
#include <linux/smp.h>
#include <asm-generic/mmiowb.h>

View File

@ -95,13 +95,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
__pud_free(mm, pud);
}
#define __pud_free_tlb(tlb, pud, addr) \
do { \
if (pgtable_l4_enabled) { \
pagetable_pud_dtor(virt_to_ptdesc(pud)); \
tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \
} \
} while (0)
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr)
{
if (pgtable_l4_enabled) {
struct ptdesc *ptdesc = virt_to_ptdesc(pud);
pagetable_pud_dtor(ptdesc);
if (riscv_use_ipi_for_rfence())
tlb_remove_page_ptdesc(tlb, ptdesc);
else
tlb_remove_ptdesc(tlb, ptdesc);
}
}
#define p4d_alloc_one p4d_alloc_one
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
@ -130,11 +136,16 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
__p4d_free(mm, p4d);
}
#define __p4d_free_tlb(tlb, p4d, addr) \
do { \
if (pgtable_l5_enabled) \
tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(p4d)); \
} while (0)
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr)
{
if (pgtable_l5_enabled) {
if (riscv_use_ipi_for_rfence())
tlb_remove_page_ptdesc(tlb, virt_to_ptdesc(p4d));
else
tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
}
#endif /* __PAGETABLE_PMD_FOLDED */
static inline void sync_kernel_mappings(pgd_t *pgd)
@ -159,19 +170,31 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
#ifndef __PAGETABLE_PMD_FOLDED
#define __pmd_free_tlb(tlb, pmd, addr) \
do { \
pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \
tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \
} while (0)
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
pagetable_pmd_dtor(ptdesc);
if (riscv_use_ipi_for_rfence())
tlb_remove_page_ptdesc(tlb, ptdesc);
else
tlb_remove_ptdesc(tlb, ptdesc);
}
#endif /* __PAGETABLE_PMD_FOLDED */
#define __pte_free_tlb(tlb, pte, buf) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
} while (0)
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
unsigned long addr)
{
struct ptdesc *ptdesc = page_ptdesc(pte);
pagetable_pte_dtor(ptdesc);
if (riscv_use_ipi_for_rfence())
tlb_remove_page_ptdesc(tlb, ptdesc);
else
tlb_remove_ptdesc(tlb, ptdesc);
}
#endif /* CONFIG_MMU */
#endif /* _ASM_RISCV_PGALLOC_H */

View File

@ -127,16 +127,10 @@
#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
#ifdef CONFIG_COMPAT
#define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
#define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39)
#define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64)
#define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64)
#else
#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
#define MMAP_MIN_VA_BITS (VA_BITS_SV39)
#endif /* CONFIG_COMPAT */
#else
#include <asm/pgtable-32.h>
#endif /* CONFIG_64BIT */
@ -439,9 +433,11 @@ static inline pte_t pte_mkhuge(pte_t pte)
return pte;
}
#ifdef CONFIG_RISCV_ISA_SVNAPOT
#define pte_leaf_size(pte) (pte_napot(pte) ? \
napot_cont_size(napot_cont_order(pte)) :\
PAGE_SIZE)
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
@ -517,12 +513,12 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
WRITE_ONCE(*ptep, pteval);
}
void flush_icache_pte(pte_t pte);
void flush_icache_pte(struct mm_struct *mm, pte_t pte);
static inline void __set_pte_at(pte_t *ptep, pte_t pteval)
static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
{
if (pte_present(pteval) && pte_exec(pteval))
flush_icache_pte(pteval);
flush_icache_pte(mm, pteval);
set_pte(ptep, pteval);
}
@ -535,7 +531,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
page_table_check_ptes_set(mm, ptep, pteval, nr);
for (;;) {
__set_pte_at(ptep, pteval);
__set_pte_at(mm, ptep, pteval);
if (--nr == 0)
break;
ptep++;
@ -547,7 +543,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
static inline void pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
__set_pte_at(ptep, __pte(0));
__set_pte_at(mm, ptep, __pte(0));
}
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */
@ -662,6 +658,12 @@ static inline int pmd_write(pmd_t pmd)
return pte_write(pmd_pte(pmd));
}
#define pud_write pud_write
static inline int pud_write(pud_t pud)
{
return pte_write(pud_pte(pud));
}
#define pmd_dirty pmd_dirty
static inline int pmd_dirty(pmd_t pmd)
{
@ -713,14 +715,14 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
page_table_check_pmd_set(mm, pmdp, pmd);
return __set_pte_at((pte_t *)pmdp, pmd_pte(pmd));
return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd));
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
page_table_check_pud_set(mm, pudp, pud);
return __set_pte_at((pte_t *)pudp, pud_pte(pud));
return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud));
}
#ifdef CONFIG_PAGE_TABLE_CHECK
@ -871,8 +873,8 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
#ifdef CONFIG_COMPAT
#define TASK_SIZE_32 (_AC(0x80000000, UL))
#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
#define TASK_SIZE_32 (_AC(0x80000000, UL) - PAGE_SIZE)
#define TASK_SIZE (is_compat_task() ? \
TASK_SIZE_32 : TASK_SIZE_64)
#else
#define TASK_SIZE TASK_SIZE_64

View File

@ -14,22 +14,21 @@
#include <asm/ptrace.h>
#ifdef CONFIG_64BIT
#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
#define STACK_TOP_MAX TASK_SIZE
/*
* addr is a hint to the maximum userspace address that mmap should provide, so
* this macro needs to return the largest address space available so that
* mmap_end < addr, where mmap_end is the top of that address space.
* See Documentation/arch/riscv/vm-layout.rst for more details.
*/
#define arch_get_mmap_end(addr, len, flags) \
({ \
unsigned long mmap_end; \
typeof(addr) _addr = (addr); \
if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
if ((_addr) == 0 || is_compat_task() || \
((_addr + len) > BIT(VA_BITS - 1))) \
mmap_end = STACK_TOP_MAX; \
else if ((_addr) >= VA_USER_SV57) \
mmap_end = STACK_TOP_MAX; \
else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
mmap_end = VA_USER_SV48; \
else \
mmap_end = VA_USER_SV39; \
mmap_end = (_addr + len); \
mmap_end; \
})
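/*
 * Worked example (assuming VA_BITS = 57, i.e. Sv57): a hint of
 * _addr = 1UL << 45 with len = 2 MiB keeps (_addr + len) below
 * BIT(VA_BITS - 1), so mmap_end is (1UL << 45) + 2 MiB and the search is
 * confined below that address; a hint of 0, a compat task, or a request
 * whose _addr + len exceeds BIT(56) instead gets mmap_end = STACK_TOP_MAX.
 */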
@ -39,17 +38,17 @@
typeof(addr) _addr = (addr); \
typeof(base) _base = (base); \
unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \
if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
if ((_addr) == 0 || is_compat_task() || \
((_addr + len) > BIT(VA_BITS - 1))) \
mmap_base = (_base); \
else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
mmap_base = VA_USER_SV57 - rnd_gap; \
else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
mmap_base = VA_USER_SV48 - rnd_gap; \
else \
mmap_base = VA_USER_SV39 - rnd_gap; \
mmap_base = (_addr + len) - rnd_gap; \
mmap_base; \
})
#ifdef CONFIG_64BIT
#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
#define STACK_TOP_MAX TASK_SIZE_64
#else
#define DEFAULT_MAP_WINDOW TASK_SIZE
#define STACK_TOP_MAX TASK_SIZE

View File

@ -34,9 +34,9 @@ static __must_check inline bool may_use_simd(void)
return false;
/*
* Nesting is acheived in preempt_v by spreading the control for
* Nesting is achieved in preempt_v by spreading the control for
* preemptible and non-preemptible kernel-mode Vector into two fields.
* Always try to match with prempt_v if kernel V-context exists. Then,
* Always try to match with preempt_v if kernel V-context exists. Then,
* fallback to check non preempt_v if nesting happens, or if the config
* is not set.
*/

View File

@ -56,4 +56,7 @@ int hibernate_resume_nonboot_cpu_disable(void);
asmlinkage void hibernate_restore_image(unsigned long resume_satp, unsigned long satp_temp,
unsigned long cpu_resume);
asmlinkage int hibernate_core_restore_code(void);
bool riscv_sbi_hsm_is_supported(void);
bool riscv_sbi_suspend_state_is_valid(u32 state);
int riscv_sbi_hart_suspend(u32 state);
#endif

View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_RISCV_SYNC_CORE_H
#define _ASM_RISCV_SYNC_CORE_H
/*
* RISC-V implements return to user-space through an xRET instruction,
* which is not core serializing.
*/
static inline void sync_core_before_usermode(void)
{
asm volatile ("fence.i" ::: "memory");
}
#ifdef CONFIG_SMP
/*
* Ensure the next switch_mm() on every CPU issues a core serializing
* instruction for the given @mm.
*/
static inline void prepare_sync_core_cmd(struct mm_struct *mm)
{
cpumask_setall(&mm->context.icache_stale_mask);
}
#else
static inline void prepare_sync_core_cmd(struct mm_struct *mm)
{
}
#endif /* CONFIG_SMP */
#endif /* _ASM_RISCV_SYNC_CORE_H */

View File

@ -12,26 +12,52 @@
asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *);
#define SC_RISCV_REGS_TO_ARGS(x, ...) \
__MAP(x,__SC_ARGS \
,,regs->orig_a0,,regs->a1,,regs->a2 \
#ifdef CONFIG_64BIT
#define __SYSCALL_SE_DEFINEx(x, prefix, name, ...) \
static long __se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static long __se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__))
#define SC_RISCV_REGS_TO_ARGS(x, ...) \
__MAP(x,__SC_ARGS \
,,regs->orig_a0,,regs->a1,,regs->a2 \
,,regs->a3,,regs->a4,,regs->a5,,regs->a6)
#else
/*
* Use type aliasing to ensure registers a0-a6 are correctly passed to the syscall
* implementation when >word-size arguments are used.
*/
#define __SYSCALL_SE_DEFINEx(x, prefix, name, ...) \
__diag_push(); \
__diag_ignore(GCC, 8, "-Wattribute-alias", \
"Type aliasing is used to sanitize syscall arguments"); \
static long __se_##prefix##name(ulong, ulong, ulong, ulong, ulong, ulong, \
ulong) \
__attribute__((alias(__stringify(___se_##prefix##name)))); \
__diag_pop(); \
static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__))
#define SC_RISCV_REGS_TO_ARGS(x, ...) \
regs->orig_a0,regs->a1,regs->a2,regs->a3,regs->a4,regs->a5,regs->a6
#endif /* CONFIG_64BIT */
#ifdef CONFIG_COMPAT
#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__riscv_compat_sys##name, ERRNO); \
static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
__SYSCALL_SE_DEFINEx(x, compat_sys, name, __VA_ARGS__) \
{ \
return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \
} \
asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs) \
{ \
return __se_compat_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \
} \
static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \
} \
static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#define COMPAT_SYSCALL_DEFINE0(sname) \
@ -51,19 +77,18 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *);
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long __riscv_sys##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__riscv_sys##name, ERRNO); \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long __riscv_sys##name(const struct pt_regs *regs) \
{ \
return __se_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \
} \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
__SYSCALL_SE_DEFINEx(x, sys, name, __VA_ARGS__) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
asmlinkage long __riscv_sys##name(const struct pt_regs *regs) \
{ \
return __se_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#define SYSCALL_DEFINE0(sname) \

View File

@ -10,6 +10,24 @@ struct mmu_gather;
static void tlb_flush(struct mmu_gather *tlb);
#ifdef CONFIG_MMU
#include <linux/swap.h>
/*
* While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
* perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
* SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
* case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
* comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
* for more details.
*/
static inline void __tlb_remove_table(void *table)
{
free_page_and_swap_cache(table);
}
#endif /* CONFIG_MMU */
#define tlb_flush tlb_flush
#include <asm-generic/tlb.h>

View File

@ -284,4 +284,15 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; }
#endif /* CONFIG_RISCV_ISA_V */
/*
* Return the implementation's vlen value.
*
* riscv_v_vsize contains the value of "32 vector registers with vlenb length"
* so rebuild the vlen value in bits from it.
*/
static inline int riscv_vector_vlen(void)
{
return riscv_v_vsize / 32 * 8;
}
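/*
 * Worked example: a hart with VLEN = 128 bits has vlenb = 16 bytes, so
 * riscv_v_vsize = 32 * 16 = 512 and riscv_vector_vlen() = 512 / 32 * 8 = 128,
 * which is the value the vector-crypto glue code above compares against when
 * it requires riscv_vector_vlen() >= 128.
 */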
#endif /* ! __ASM_RISCV_VECTOR_H */

View File

@ -5,7 +5,7 @@
#ifndef ASM_VENDOR_LIST_H
#define ASM_VENDOR_LIST_H
#define ANDESTECH_VENDOR_ID 0x31e
#define ANDES_VENDOR_ID 0x31e
#define SIFIVE_VENDOR_ID 0x489
#define THEAD_VENDOR_ID 0x5b7

View File

@ -39,7 +39,6 @@ extra-y += vmlinux.lds
obj-y += head.o
obj-y += soc.o
obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o
obj-y += copy-unaligned.o
obj-y += cpu.o
obj-y += cpufeature.o
obj-y += entry.o
@ -64,6 +63,9 @@ obj-y += tests/
obj-$(CONFIG_MMU) += vdso.o vdso/
obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
obj-$(CONFIG_FPU) += fpu.o
obj-$(CONFIG_RISCV_ISA_V) += vector.o
obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o

View File

@ -43,7 +43,7 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info
switch (cpu_mfr_info->vendor_id) {
#ifdef CONFIG_ERRATA_ANDES
case ANDESTECH_VENDOR_ID:
case ANDES_VENDOR_ID:
cpu_mfr_info->patch_func = andes_errata_patch_func;
break;
#endif

View File

@ -11,7 +11,6 @@
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/ctype.h>
#include <linux/jump_label.h>
#include <linux/log2.h>
#include <linux/memory.h>
#include <linux/module.h>
@ -21,21 +20,13 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwcap.h>
#include <asm/hwprobe.h>
#include <asm/patch.h>
#include <asm/processor.h>
#include <asm/sbi.h>
#include <asm/vector.h>
#include "copy-unaligned.h"
#define NUM_ALPHA_EXTS ('z' - 'a' + 1)
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
unsigned long elf_hwcap __read_mostly;
/* Host ISA bitmap */
@ -44,11 +35,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
/* Per-cpu ISA extensions. */
struct riscv_isainfo hart_isa[NR_CPUS];
/* Performance information */
DEFINE_PER_CPU(long, misaligned_access_speed);
static cpumask_t fast_misaligned_access;
/**
* riscv_isa_extension_base() - Get base extension word
*
@ -318,6 +304,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
__RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU),
};
const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);
@ -731,247 +718,6 @@ unsigned long riscv_get_elf_hwcap(void)
return hwcap;
}
static int check_unaligned_access(void *param)
{
int cpu = smp_processor_id();
u64 start_cycles, end_cycles;
u64 word_cycles;
u64 byte_cycles;
int ratio;
unsigned long start_jiffies, now;
struct page *page = param;
void *dst;
void *src;
long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
if (check_unaligned_access_emulated(cpu))
return 0;
/* Make an unaligned destination buffer. */
dst = (void *)((unsigned long)page_address(page) | 0x1);
/* Unalign src as well, but differently (off by 1 + 2 = 3). */
src = dst + (MISALIGNED_BUFFER_SIZE / 2);
src += 2;
word_cycles = -1ULL;
/* Do a warmup. */
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
preempt_disable();
start_jiffies = jiffies;
while ((now = jiffies) == start_jiffies)
cpu_relax();
/*
* For a fixed amount of time, repeatedly try the function, and take
* the best time in cycles as the measurement.
*/
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
start_cycles = get_cycles64();
/* Ensure the CSR read can't reorder WRT to the copy. */
mb();
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
/* Ensure the copy ends before the end time is snapped. */
mb();
end_cycles = get_cycles64();
if ((end_cycles - start_cycles) < word_cycles)
word_cycles = end_cycles - start_cycles;
}
byte_cycles = -1ULL;
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
start_jiffies = jiffies;
while ((now = jiffies) == start_jiffies)
cpu_relax();
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
start_cycles = get_cycles64();
mb();
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
mb();
end_cycles = get_cycles64();
if ((end_cycles - start_cycles) < byte_cycles)
byte_cycles = end_cycles - start_cycles;
}
preempt_enable();
/* Don't divide by zero. */
if (!word_cycles || !byte_cycles) {
pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
cpu);
return 0;
}
if (word_cycles < byte_cycles)
speed = RISCV_HWPROBE_MISALIGNED_FAST;
ratio = div_u64((byte_cycles * 100), word_cycles);
pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
cpu,
ratio / 100,
ratio % 100,
(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
per_cpu(misaligned_access_speed, cpu) = speed;
/*
* Set the value of fast_misaligned_access of a CPU. These operations
* are atomic to avoid race conditions.
*/
if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
cpumask_set_cpu(cpu, &fast_misaligned_access);
else
cpumask_clear_cpu(cpu, &fast_misaligned_access);
return 0;
}
static void check_unaligned_access_nonboot_cpu(void *param)
{
unsigned int cpu = smp_processor_id();
struct page **pages = param;
if (smp_processor_id() != 0)
check_unaligned_access(pages[cpu]);
}
DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
if (cpumask_weight(mask) == weight)
static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
else
static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
}
static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
/*
* Same as set_unaligned_access_static_branches, except excludes the
* given CPU from the result. When a CPU is hotplugged into an offline
* state, this function is called before the CPU is set to offline in
* the cpumask, and thus the CPU needs to be explicitly excluded.
*/
cpumask_t fast_except_me;
cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
cpumask_clear_cpu(cpu, &fast_except_me);
modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}
static void set_unaligned_access_static_branches(void)
{
/*
* This will be called after check_unaligned_access_all_cpus so the
* result of unaligned access speed for all CPUs will be available.
*
* To avoid the number of online cpus changing between reading
* cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
* held before calling this function.
*/
cpumask_t fast_and_online;
cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}
static int lock_and_set_unaligned_access_static_branch(void)
{
cpus_read_lock();
set_unaligned_access_static_branches();
cpus_read_unlock();
return 0;
}
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
static int riscv_online_cpu(unsigned int cpu)
{
static struct page *buf;
/* We are already set since the last check */
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
goto exit;
buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
if (!buf) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
return -ENOMEM;
}
check_unaligned_access(buf);
__free_pages(buf, MISALIGNED_BUFFER_ORDER);
exit:
set_unaligned_access_static_branches();
return 0;
}
static int riscv_offline_cpu(unsigned int cpu)
{
set_unaligned_access_static_branches_except_cpu(cpu);
return 0;
}
/* Measure unaligned access on all CPUs present at boot in parallel. */
static int check_unaligned_access_all_cpus(void)
{
unsigned int cpu;
unsigned int cpu_count = num_possible_cpus();
struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
GFP_KERNEL);
if (!bufs) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
return 0;
}
/*
* Allocate separate buffers for each CPU so there's no fighting over
* cache lines.
*/
for_each_cpu(cpu, cpu_online_mask) {
bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
if (!bufs[cpu]) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
goto out;
}
}
/* Check everybody except 0, who stays behind to tend jiffies. */
on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
/* Check core 0. */
smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
/*
* Setup hotplug callbacks for any new CPUs that come online or go
* offline.
*/
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
riscv_online_cpu, riscv_offline_cpu);
out:
unaligned_emulation_finish();
for_each_cpu(cpu, cpu_online_mask) {
if (bufs[cpu])
__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
}
kfree(bufs);
return 0;
}
arch_initcall(check_unaligned_access_all_cpus);
void riscv_user_isa_enable(void)
{
if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))

View File

@ -111,6 +111,7 @@ SYM_CODE_START(handle_exception)
1:
tail do_trap_unknown
SYM_CODE_END(handle_exception)
ASM_NOKPROBE(handle_exception)
/*
* The ret_from_exception must be called with interrupt disabled. Here is the
@ -184,6 +185,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
sret
#endif
SYM_CODE_END(ret_from_exception)
ASM_NOKPROBE(ret_from_exception)
#ifdef CONFIG_VMAP_STACK
SYM_CODE_START_LOCAL(handle_kernel_stack_overflow)
@ -219,6 +221,7 @@ SYM_CODE_START_LOCAL(handle_kernel_stack_overflow)
move a0, sp
tail handle_bad_stack
SYM_CODE_END(handle_kernel_stack_overflow)
ASM_NOKPROBE(handle_kernel_stack_overflow)
#endif
SYM_CODE_START(ret_from_fork)

View File

@ -9,6 +9,9 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
-fno-asynchronous-unwind-tables -fno-unwind-tables \
$(call cc-option,-fno-addrsig)
# Disable LTO
KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS))
KBUILD_CFLAGS += -mcmodel=medany
CFLAGS_cmdline_early.o += -D__NO_FORTIFY

View File

@ -377,14 +377,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
return ret;
}
#else
static const struct user_regset_view compat_riscv_user_native_view = {};
#endif /* CONFIG_COMPAT */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_COMPAT
if (test_tsk_thread_flag(task, TIF_32BIT))
if (is_compat_thread(&task->thread_info))
return &compat_riscv_user_native_view;
else
#endif
return &riscv_user_native_view;
}

View File

@ -28,7 +28,6 @@
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/cpufeature.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/numa.h>

View File

@ -132,4 +132,53 @@ static int __init sbi_system_suspend_init(void)
}
arch_initcall(sbi_system_suspend_init);
static int sbi_suspend_finisher(unsigned long suspend_type,
unsigned long resume_addr,
unsigned long opaque)
{
struct sbiret ret;
ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
suspend_type, resume_addr, opaque, 0, 0, 0);
return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0;
}
int riscv_sbi_hart_suspend(u32 state)
{
if (state & SBI_HSM_SUSP_NON_RET_BIT)
return cpu_suspend(state, sbi_suspend_finisher);
else
return sbi_suspend_finisher(state, 0, 0);
}
bool riscv_sbi_suspend_state_is_valid(u32 state)
{
if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
state < SBI_HSM_SUSPEND_RET_PLATFORM)
return false;
if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
return false;
return true;
}
bool riscv_sbi_hsm_is_supported(void)
{
/*
* The SBI HSM suspend function is only available when:
* 1) SBI version is 0.3 or higher
* 2) SBI HSM extension is available
*/
if (sbi_spec_version < sbi_mk_version(0, 3) ||
!sbi_probe_extension(SBI_EXT_HSM)) {
pr_info("HSM suspend not available\n");
return false;
}
return true;
}
#endif /* CONFIG_RISCV_SBI */

View File

@ -147,6 +147,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
return (pair.value & ext);
}
#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
static u64 hwprobe_misaligned(const struct cpumask *cpus)
{
int cpu;
@ -169,6 +170,18 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
return perf;
}
#else
static u64 hwprobe_misaligned(const struct cpumask *cpus)
{
if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS))
return RISCV_HWPROBE_MISALIGNED_FAST;
if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available())
return RISCV_HWPROBE_MISALIGNED_EMULATED;
return RISCV_HWPROBE_MISALIGNED_SLOW;
}
#endif
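Whichever path computes it, this per-CPU value is what user-space reads back through the riscv_hwprobe syscall; a minimal sketch of querying it (key/value names from the uapi <asm/hwprobe.h>; passing a zero-sized cpu set is assumed to mean "all online CPUs"):

#include <asm/hwprobe.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	/* One key/value pair, no explicit cpu set. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	if ((pair.value & RISCV_HWPROBE_MISALIGNED_MASK) ==
	    RISCV_HWPROBE_MISALIGNED_FAST)
		printf("misaligned accesses are fast\n");

	return 0;
}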
static void hwprobe_one_pair(struct riscv_hwprobe *pair,
const struct cpumask *cpus)

View File

@ -6,6 +6,7 @@
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/randomize_kstack.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/signal.h>
@ -310,7 +311,8 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs)
}
}
asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs)
asmlinkage __visible __trap_section __no_stack_protector
void do_trap_ecall_u(struct pt_regs *regs)
{
if (user_mode(regs)) {
long syscall = regs->a7;
@ -322,10 +324,23 @@ asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs)
syscall = syscall_enter_from_user_mode(regs, syscall);
add_random_kstack_offset();
if (syscall >= 0 && syscall < NR_syscalls)
syscall_handler(regs, syscall);
else if (syscall != -1)
regs->a0 = -ENOSYS;
/*
* Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
* so the maximum stack offset is 1k bytes (10 bits).
*
* The actual entropy will be further reduced by the compiler when
* applying stack alignment constraints: 16-byte (i.e. 4-bit) aligned
* for RV32I or RV64I.
*
* The resulting 6 bits of entropy are seen in SP[9:4].
*/
choose_random_kstack_offset(get_random_u16());
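/*
 * Example with the numbers above: a random value of 0x7b5c is masked by
 * KSTACK_OFFSET_MAX() to 0x35c (10 bits); 16-byte stack alignment then
 * discards the low 4 bits, leaving 0x35 visible in SP[9:4].
 */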
syscall_exit_to_user_mode(regs);
} else {

View File

@ -413,7 +413,9 @@ int handle_misaligned_load(struct pt_regs *regs)
perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
*this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
#endif
if (!unaligned_enabled)
return -1;
@ -596,7 +598,7 @@ int handle_misaligned_store(struct pt_regs *regs)
return 0;
}
bool check_unaligned_access_emulated(int cpu)
static bool check_unaligned_access_emulated(int cpu)
{
long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
unsigned long tmp_var, tmp_val;
@ -623,7 +625,7 @@ bool check_unaligned_access_emulated(int cpu)
return misaligned_emu_detected;
}
void unaligned_emulation_finish(void)
bool check_unaligned_access_emulated_all_cpus(void)
{
int cpu;
@ -632,13 +634,12 @@ void unaligned_emulation_finish(void)
* accesses emulated since tasks requesting such control can run on any
* CPU.
*/
for_each_present_cpu(cpu) {
if (per_cpu(misaligned_access_speed, cpu) !=
RISCV_HWPROBE_MISALIGNED_EMULATED) {
return;
}
}
for_each_online_cpu(cpu)
if (!check_unaligned_access_emulated(cpu))
return false;
unaligned_ctl = true;
return true;
}
bool unaligned_ctl_available(void)

View File

@ -0,0 +1,281 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2024 Rivos Inc.
*/
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include "copy-unaligned.h"
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
DEFINE_PER_CPU(long, misaligned_access_speed);
#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;
static int check_unaligned_access(void *param)
{
int cpu = smp_processor_id();
u64 start_cycles, end_cycles;
u64 word_cycles;
u64 byte_cycles;
int ratio;
unsigned long start_jiffies, now;
struct page *page = param;
void *dst;
void *src;
long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
return 0;
/* Make an unaligned destination buffer. */
dst = (void *)((unsigned long)page_address(page) | 0x1);
/* Unalign src as well, but differently (off by 1 + 2 = 3). */
src = dst + (MISALIGNED_BUFFER_SIZE / 2);
src += 2;
word_cycles = -1ULL;
/* Do a warmup. */
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
preempt_disable();
start_jiffies = jiffies;
while ((now = jiffies) == start_jiffies)
cpu_relax();
/*
* For a fixed amount of time, repeatedly try the function, and take
* the best time in cycles as the measurement.
*/
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
start_cycles = get_cycles64();
/* Ensure the CSR read can't reorder WRT to the copy. */
mb();
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
/* Ensure the copy ends before the end time is snapped. */
mb();
end_cycles = get_cycles64();
if ((end_cycles - start_cycles) < word_cycles)
word_cycles = end_cycles - start_cycles;
}
byte_cycles = -1ULL;
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
start_jiffies = jiffies;
while ((now = jiffies) == start_jiffies)
cpu_relax();
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
start_cycles = get_cycles64();
mb();
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
mb();
end_cycles = get_cycles64();
if ((end_cycles - start_cycles) < byte_cycles)
byte_cycles = end_cycles - start_cycles;
}
preempt_enable();
/* Don't divide by zero. */
if (!word_cycles || !byte_cycles) {
pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
cpu);
return 0;
}
if (word_cycles < byte_cycles)
speed = RISCV_HWPROBE_MISALIGNED_FAST;
ratio = div_u64((byte_cycles * 100), word_cycles);
pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
cpu,
ratio / 100,
ratio % 100,
(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
per_cpu(misaligned_access_speed, cpu) = speed;
/*
* Set the value of fast_misaligned_access of a CPU. These operations
* are atomic to avoid race conditions.
*/
if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
cpumask_set_cpu(cpu, &fast_misaligned_access);
else
cpumask_clear_cpu(cpu, &fast_misaligned_access);
return 0;
}
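/*
 * Example of the ratio printed above: byte_cycles = 520 and word_cycles = 115
 * give ratio = 452, reported as "4.52"; since the word copy is faster, the
 * CPU is recorded as RISCV_HWPROBE_MISALIGNED_FAST.
 */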
static void check_unaligned_access_nonboot_cpu(void *param)
{
unsigned int cpu = smp_processor_id();
struct page **pages = param;
if (smp_processor_id() != 0)
check_unaligned_access(pages[cpu]);
}
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
if (cpumask_weight(mask) == weight)
static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
else
static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}
static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
/*
* Same as set_unaligned_access_static_branches, except excludes the
* given CPU from the result. When a CPU is hotplugged into an offline
* state, this function is called before the CPU is set to offline in
* the cpumask, and thus the CPU needs to be explicitly excluded.
*/
cpumask_t fast_except_me;
cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
cpumask_clear_cpu(cpu, &fast_except_me);
modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}
static void set_unaligned_access_static_branches(void)
{
/*
* This will be called after check_unaligned_access_all_cpus so the
* result of unaligned access speed for all CPUs will be available.
*
* To avoid the number of online cpus changing between reading
* cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
* held before calling this function.
*/
cpumask_t fast_and_online;
cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}
static int lock_and_set_unaligned_access_static_branch(void)
{
cpus_read_lock();
set_unaligned_access_static_branches();
cpus_read_unlock();
return 0;
}
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
static int riscv_online_cpu(unsigned int cpu)
{
static struct page *buf;
/* We are already set since the last check */
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
goto exit;
buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
if (!buf) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
return -ENOMEM;
}
check_unaligned_access(buf);
__free_pages(buf, MISALIGNED_BUFFER_ORDER);
exit:
set_unaligned_access_static_branches();
return 0;
}
static int riscv_offline_cpu(unsigned int cpu)
{
set_unaligned_access_static_branches_except_cpu(cpu);
return 0;
}
/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
unsigned int cpu;
unsigned int cpu_count = num_possible_cpus();
struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
if (!bufs) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
return 0;
}
/*
* Allocate separate buffers for each CPU so there's no fighting over
* cache lines.
*/
for_each_cpu(cpu, cpu_online_mask) {
bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
if (!bufs[cpu]) {
pr_warn("Allocation failure, not measuring misaligned performance\n");
goto out;
}
}
/* Check everybody except 0, who stays behind to tend jiffies. */
on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
/* Check core 0. */
smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
/*
* Setup hotplug callbacks for any new CPUs that come online or go
* offline.
*/
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
riscv_online_cpu, riscv_offline_cpu);
out:
for_each_cpu(cpu, cpu_online_mask) {
if (bufs[cpu])
__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
}
kfree(bufs);
return 0;
}
static int check_unaligned_access_all_cpus(void)
{
bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
if (!all_cpus_emulated)
return check_unaligned_access_speed_all_cpus();
return 0;
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_all_cpus(void)
{
check_unaligned_access_emulated_all_cpus();
return 0;
}
#endif
arch_initcall(check_unaligned_access_all_cpus);

View File

@ -3,7 +3,7 @@
* Checksum library
*
* Influenced by arch/arm64/lib/csum.c
* Copyright (C) 2023 Rivos Inc.
* Copyright (C) 2023-2024 Rivos Inc.
*/
#include <linux/bitops.h>
#include <linux/compiler.h>
@ -318,10 +318,7 @@ unsigned int do_csum(const unsigned char *buff, int len)
* branches. The largest chunk of overlap was delegated into the
* do_csum_common function.
*/
if (static_branch_likely(&fast_misaligned_access_speed_key))
return do_csum_no_alignment(buff, len);
if (((unsigned long)buff & OFFSET_MASK) == 0)
if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
return do_csum_no_alignment(buff, len);
return do_csum_with_alignment(buff, len);

View File

@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/linkage.h>
#include <asm-generic/export.h>
#include <asm/asm.h>
#include <asm/asm-extable.h>
#include <asm/csr.h>


@ -82,12 +82,12 @@ void flush_icache_mm(struct mm_struct *mm, bool local)
#endif /* CONFIG_SMP */
#ifdef CONFIG_MMU
void flush_icache_pte(pte_t pte)
void flush_icache_pte(struct mm_struct *mm, pte_t pte)
{
struct folio *folio = page_folio(pte_page(pte));
if (!test_bit(PG_dcache_clean, &folio->flags)) {
flush_icache_all();
flush_icache_mm(mm, false);
set_bit(PG_dcache_clean, &folio->flags);
}
}


@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (unlikely(prev == next))
return;
membarrier_arch_switch_mm(prev, next, task);
/*
* Mark the current MM context as inactive, and the next as
* active. This is at least used by the icache flushing


@ -764,6 +764,11 @@ static int __init print_no5lvl(char *p)
}
early_param("no5lvl", print_no5lvl);
static void __init set_mmap_rnd_bits_max(void)
{
mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3;
}
/*
* There is a simple way to determine if 4-level is supported by the
* underlying hardware: establish 1:1 mapping in 4-level page table mode
@ -1078,6 +1083,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
set_satp_mode(dtb_pa);
set_mmap_rnd_bits_max();
#endif
/*


@ -10,7 +10,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
pte_t entry, int dirty)
{
if (!pte_same(ptep_get(ptep), entry))
__set_pte_at(ptep, entry);
__set_pte_at(vma->vm_mm, ptep, entry);
/*
* update_mmu_cache will unconditionally execute, handling both
* the case that the PTE changed and the spurious fault case.


@ -1497,6 +1497,9 @@ endif
if PPC
source "arch/powerpc/crypto/Kconfig"
endif
if RISCV
source "arch/riscv/crypto/Kconfig"
endif
if S390
source "arch/s390/crypto/Kconfig"
endif


@ -286,7 +286,7 @@ config ACPI_CPPC_LIB
config ACPI_PROCESSOR
tristate "Processor"
depends on X86 || ARM64 || LOONGARCH
depends on X86 || ARM64 || LOONGARCH || RISCV
select ACPI_PROCESSOR_IDLE
select ACPI_CPU_FREQ_PSS if X86 || LOONGARCH
select THERMAL


@ -1,2 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y += rhct.o
obj-y += rhct.o
obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o

drivers/acpi/riscv/cppc.c Normal file

@ -0,0 +1,157 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Implement CPPC FFH helper routines for RISC-V.
*
* Copyright (C) 2024 Ventana Micro Systems Inc.
*/
#include <acpi/cppc_acpi.h>
#include <asm/csr.h>
#include <asm/sbi.h>
#define SBI_EXT_CPPC 0x43505043
/* CPPC interfaces defined in SBI spec */
#define SBI_CPPC_PROBE 0x0
#define SBI_CPPC_READ 0x1
#define SBI_CPPC_READ_HI 0x2
#define SBI_CPPC_WRITE 0x3
/* RISC-V FFH definitions from RISC-V FFH spec */
#define FFH_CPPC_TYPE(r) (((r) & GENMASK_ULL(63, 60)) >> 60)
#define FFH_CPPC_SBI_REG(r) ((r) & GENMASK(31, 0))
#define FFH_CPPC_CSR_NUM(r) ((r) & GENMASK(11, 0))
#define FFH_CPPC_SBI 0x1
#define FFH_CPPC_CSR 0x2
struct sbi_cppc_data {
u64 val;
u32 reg;
struct sbiret ret;
};
static bool cppc_ext_present;
static int __init sbi_cppc_init(void)
{
if (sbi_spec_version >= sbi_mk_version(2, 0) &&
sbi_probe_extension(SBI_EXT_CPPC) > 0) {
pr_info("SBI CPPC extension detected\n");
cppc_ext_present = true;
} else {
pr_info("SBI CPPC extension NOT detected!!\n");
cppc_ext_present = false;
}
return 0;
}
device_initcall(sbi_cppc_init);
static void sbi_cppc_read(void *read_data)
{
struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data;
data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_READ,
data->reg, 0, 0, 0, 0, 0);
}
static void sbi_cppc_write(void *write_data)
{
struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data;
data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_WRITE,
data->reg, data->val, 0, 0, 0, 0);
}
static void cppc_ffh_csr_read(void *read_data)
{
struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data;
switch (data->reg) {
/* Support only TIME CSR for now */
case CSR_TIME:
data->ret.value = csr_read(CSR_TIME);
data->ret.error = 0;
break;
default:
data->ret.error = -EINVAL;
break;
}
}
static void cppc_ffh_csr_write(void *write_data)
{
struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data;
data->ret.error = -EINVAL;
}
/*
* Refer to drivers/acpi/cppc_acpi.c for the description of the functions
* below.
*/
bool cpc_ffh_supported(void)
{
return true;
}
int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val)
{
struct sbi_cppc_data data;
if (WARN_ON_ONCE(irqs_disabled()))
return -EPERM;
if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) {
if (!cppc_ext_present)
return -EINVAL;
data.reg = FFH_CPPC_SBI_REG(reg->address);
smp_call_function_single(cpu, sbi_cppc_read, &data, 1);
*val = data.ret.value;
return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
} else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) {
data.reg = FFH_CPPC_CSR_NUM(reg->address);
smp_call_function_single(cpu, cppc_ffh_csr_read, &data, 1);
*val = data.ret.value;
return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
}
return -EINVAL;
}
int cpc_write_ffh(int cpu, struct cpc_reg *reg, u64 val)
{
struct sbi_cppc_data data;
if (WARN_ON_ONCE(irqs_disabled()))
return -EPERM;
if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) {
if (!cppc_ext_present)
return -EINVAL;
data.reg = FFH_CPPC_SBI_REG(reg->address);
data.val = val;
smp_call_function_single(cpu, sbi_cppc_write, &data, 1);
return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
} else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) {
data.reg = FFH_CPPC_CSR_NUM(reg->address);
data.val = val;
smp_call_function_single(cpu, cppc_ffh_csr_write, &data, 1);
return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
}
return -EINVAL;
}
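The FFH_CPPC_* masks above split a CPPC Functional Fixed Hardware register address into a type nibble (bits 63:60) and a register payload (an SBI CPPC register id in bits 31:0, or a CSR number in bits 11:0). A self-contained decode of a purely hypothetical FFH address, mirroring those masks:

#include <stdint.h>
#include <stdio.h>

/* Illustrative copies of FFH_CPPC_TYPE()/FFH_CPPC_SBI_REG() from the file above. */
#define TYPE(r)    (((r) >> 60) & 0xfULL)
#define SBI_REG(r) ((uint32_t)(r))

int main(void)
{
	/* Hypothetical address: type 0x1 (FFH_CPPC_SBI), SBI CPPC register id 0x5. */
	uint64_t addr = (0x1ULL << 60) | 0x5;

	/* Prints "type=0x1 reg=0x5": cpc_read_ffh() would issue SBI_CPPC_READ for reg 5. */
	printf("type=%#llx reg=%#x\n", (unsigned long long)TYPE(addr), SBI_REG(addr));
	return 0;
}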


@ -0,0 +1,81 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2024, Ventana Micro Systems Inc
* Author: Sunil V L <sunilvl@ventanamicro.com>
*
*/
#include <linux/acpi.h>
#include <acpi/processor.h>
#include <linux/cpu_pm.h>
#include <linux/cpuidle.h>
#include <linux/suspend.h>
#include <asm/cpuidle.h>
#include <asm/sbi.h>
#include <asm/suspend.h>
#define RISCV_FFH_LPI_TYPE_MASK GENMASK_ULL(63, 60)
#define RISCV_FFH_LPI_RSVD_MASK GENMASK_ULL(59, 32)
#define RISCV_FFH_LPI_TYPE_SBI BIT_ULL(60)
static int acpi_cpu_init_idle(unsigned int cpu)
{
int i;
struct acpi_lpi_state *lpi;
struct acpi_processor *pr = per_cpu(processors, cpu);
if (unlikely(!pr || !pr->flags.has_lpi))
return -EINVAL;
if (!riscv_sbi_hsm_is_supported())
return -ENODEV;
if (pr->power.count <= 1)
return -ENODEV;
for (i = 1; i < pr->power.count; i++) {
u32 state;
lpi = &pr->power.lpi_states[i];
/*
* Validate Entry Method as per FFH spec.
* bits[63:60] should be 0x1
* bits[59:32] should be 0x0
* bits[31:0] represent a SBI power_state
*/
if (((lpi->address & RISCV_FFH_LPI_TYPE_MASK) != RISCV_FFH_LPI_TYPE_SBI) ||
(lpi->address & RISCV_FFH_LPI_RSVD_MASK)) {
pr_warn("Invalid LPI entry method %#llx\n", lpi->address);
return -EINVAL;
}
state = lpi->address;
if (!riscv_sbi_suspend_state_is_valid(state)) {
pr_warn("Invalid SBI power state %#x\n", state);
return -EINVAL;
}
}
return 0;
}
int acpi_processor_ffh_lpi_probe(unsigned int cpu)
{
return acpi_cpu_init_idle(cpu);
}
int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
{
u32 state = lpi->address;
if (state & SBI_HSM_SUSP_NON_RET_BIT)
return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend,
lpi->index,
state);
else
return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend,
lpi->index,
state);
}
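The comment in acpi_cpu_init_idle() pins down the RISC-V FFH LPI encoding: bits 63:60 must be 0x1 (SBI), bits 59:32 must be zero, and bits 31:0 carry the SBI HSM power_state. A small self-contained sketch of that validation, using the same bit layout (the sample address is made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define LPI_TYPE_MASK (0xfULL << 60)          /* RISCV_FFH_LPI_TYPE_MASK */
#define LPI_RSVD_MASK (0x0fffffffULL << 32)   /* RISCV_FFH_LPI_RSVD_MASK */
#define LPI_TYPE_SBI  (0x1ULL << 60)          /* RISCV_FFH_LPI_TYPE_SBI  */

/* Returns the SBI power_state, or -1 if the entry method is malformed. */
static int64_t lpi_to_sbi_state(uint64_t address)
{
	if ((address & LPI_TYPE_MASK) != LPI_TYPE_SBI || (address & LPI_RSVD_MASK))
		return -1;
	return (uint32_t)address;
}

int main(void)
{
	/* Hypothetical non-retentive suspend state (bit 31 set, cf. SBI_HSM_SUSP_NON_RET_BIT). */
	uint64_t lpi = (0x1ULL << 60) | 0x80000000u;

	printf("power_state = %lld\n", (long long)lpi_to_sbi_state(lpi));
	return 0;
}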


@ -131,7 +131,7 @@ static int clint_timer_starting_cpu(unsigned int cpu)
struct clock_event_device *ce = per_cpu_ptr(&clint_clock_event, cpu);
ce->cpumask = cpumask_of(cpu);
clockevents_config_and_register(ce, clint_timer_freq, 100, 0x7fffffff);
clockevents_config_and_register(ce, clint_timer_freq, 100, ULONG_MAX);
enable_percpu_irq(clint_timer_irq,
irq_get_trigger_type(clint_timer_irq));


@ -114,7 +114,7 @@ static int riscv_timer_starting_cpu(unsigned int cpu)
ce->features |= CLOCK_EVT_FEAT_C3STOP;
if (static_branch_likely(&riscv_sstc_available))
ce->rating = 450;
clockevents_config_and_register(ce, riscv_timebase, 100, 0x7fffffff);
clockevents_config_and_register(ce, riscv_timebase, 100, ULONG_MAX);
enable_percpu_irq(riscv_clock_event_irq,
irq_get_trigger_type(riscv_clock_event_irq));


@ -302,4 +302,33 @@ config QORIQ_CPUFREQ
which are capable of changing the CPU's frequency dynamically.
endif
config ACPI_CPPC_CPUFREQ
tristate "CPUFreq driver based on the ACPI CPPC spec"
depends on ACPI_PROCESSOR
depends on ARM || ARM64 || RISCV
select ACPI_CPPC_LIB
help
This adds a CPUFreq driver which uses CPPC methods
as described in the ACPIv5.1 spec. CPPC stands for
Collaborative Processor Performance Controls. It
is based on an abstract continuous scale of CPU
performance values which allows the remote power
processor to flexibly optimize for power and
performance. CPPC relies on power management firmware
support for its operation.
If in doubt, say N.
config ACPI_CPPC_CPUFREQ_FIE
bool "Frequency Invariance support for CPPC cpufreq driver"
depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
depends on ARM || ARM64 || RISCV
default y
help
This extends frequency invariance support in the CPPC cpufreq driver,
by using CPPC delivered and reference performance counters.
If in doubt, say N.
endmenu


@ -3,32 +3,6 @@
# ARM CPU Frequency scaling drivers
#
config ACPI_CPPC_CPUFREQ
tristate "CPUFreq driver based on the ACPI CPPC spec"
depends on ACPI_PROCESSOR
select ACPI_CPPC_LIB
help
This adds a CPUFreq driver which uses CPPC methods
as described in the ACPIv5.1 spec. CPPC stands for
Collaborative Processor Performance Controls. It
is based on an abstract continuous scale of CPU
performance values which allows the remote power
processor to flexibly optimize for power and
performance. CPPC relies on power management firmware
support for its operation.
If in doubt, say N.
config ACPI_CPPC_CPUFREQ_FIE
bool "Frequency Invariance support for CPPC cpufreq driver"
depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
default y
help
This extends frequency invariance support in the CPPC cpufreq driver,
by using CPPC delivered and reference performance counters.
If in doubt, say N.
config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
tristate "Allwinner nvmem based SUN50I CPUFreq driver"
depends on ARCH_SUNXI


@ -73,26 +73,6 @@ static inline bool sbi_is_domain_state_available(void)
return data->available;
}
static int sbi_suspend_finisher(unsigned long suspend_type,
unsigned long resume_addr,
unsigned long opaque)
{
struct sbiret ret;
ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
suspend_type, resume_addr, opaque, 0, 0, 0);
return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0;
}
static int sbi_suspend(u32 state)
{
if (state & SBI_HSM_SUSP_NON_RET_BIT)
return cpu_suspend(state, sbi_suspend_finisher);
else
return sbi_suspend_finisher(state, 0, 0);
}
static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int idx)
{
@ -100,9 +80,9 @@ static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
u32 state = states[idx];
if (state & SBI_HSM_SUSP_NON_RET_BIT)
return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state);
return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend, idx, state);
else
return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend,
return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend,
idx, state);
}
@ -133,7 +113,7 @@ static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
else
state = states[idx];
ret = sbi_suspend(state) ? -1 : idx;
ret = riscv_sbi_hart_suspend(state) ? -1 : idx;
ct_cpuidle_exit();
@ -206,17 +186,6 @@ static const struct of_device_id sbi_cpuidle_state_match[] = {
{ },
};
static bool sbi_suspend_state_is_valid(u32 state)
{
if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
state < SBI_HSM_SUSPEND_RET_PLATFORM)
return false;
if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
return false;
return true;
}
static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
{
int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state);
@ -226,7 +195,7 @@ static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
return err;
}
if (!sbi_suspend_state_is_valid(*state)) {
if (!riscv_sbi_suspend_state_is_valid(*state)) {
pr_warn("Invalid SBI suspend state %#x\n", *state);
return -EINVAL;
}
@ -607,16 +576,8 @@ static int __init sbi_cpuidle_init(void)
int ret;
struct platform_device *pdev;
/*
* The SBI HSM suspend function is only available when:
* 1) SBI version is 0.3 or higher
* 2) SBI HSM extension is available
*/
if ((sbi_spec_version < sbi_mk_version(0, 3)) ||
!sbi_probe_extension(SBI_EXT_HSM)) {
pr_info("HSM suspend not available\n");
if (!riscv_sbi_hsm_is_supported())
return 0;
}
ret = platform_driver_register(&sbi_cpuidle_driver);
if (ret)


@ -96,6 +96,20 @@ config STARFIVE_STARLINK_PMU
an L3 memory system. The L3 cache events are added into perf event
subsystem, allowing monitoring of various L3 cache perf events.
config ANDES_CUSTOM_PMU
bool "Andes custom PMU support"
depends on ARCH_RENESAS && RISCV_ALTERNATIVE && RISCV_PMU_SBI
default y
help
The Andes cores implement the PMU overflow extension very
similar to the standard Sscofpmf and Smcntrpmf extension.
This will patch the overflow and pending CSRs and handle the
non-standard behaviour via the regular SBI PMU driver and
interface.
If you don't know what to do here, say "Y".
config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y


@ -19,11 +19,33 @@
#include <linux/of.h>
#include <linux/cpu_pm.h>
#include <linux/sched/clock.h>
#include <linux/soc/andes/irq.h>
#include <asm/errata_list.h>
#include <asm/sbi.h>
#include <asm/cpufeature.h>
#define ALT_SBI_PMU_OVERFLOW(__ovl) \
asm volatile(ALTERNATIVE_2( \
"csrr %0, " __stringify(CSR_SSCOUNTOVF), \
"csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \
THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \
CONFIG_ERRATA_THEAD_PMU, \
"csrr %0, " __stringify(ANDES_CSR_SCOUNTEROF), \
0, RISCV_ISA_EXT_XANDESPMU, \
CONFIG_ANDES_CUSTOM_PMU) \
: "=r" (__ovl) : \
: "memory")
#define ALT_SBI_PMU_OVF_CLEAR_PENDING(__irq_mask) \
asm volatile(ALTERNATIVE( \
"csrc " __stringify(CSR_IP) ", %0\n\t", \
"csrc " __stringify(ANDES_CSR_SLIP) ", %0\n\t", \
0, RISCV_ISA_EXT_XANDESPMU, \
CONFIG_ANDES_CUSTOM_PMU) \
: : "r"(__irq_mask) \
: "memory")
#define SYSCTL_NO_USER_ACCESS 0
#define SYSCTL_USER_ACCESS 1
#define SYSCTL_LEGACY 2
@ -61,6 +83,7 @@ static int sysctl_perf_user_access __read_mostly = SYSCTL_USER_ACCESS;
static union sbi_pmu_ctr_info *pmu_ctr_list;
static bool riscv_pmu_use_irq;
static unsigned int riscv_pmu_irq_num;
static unsigned int riscv_pmu_irq_mask;
static unsigned int riscv_pmu_irq;
/* Cache the available counters in a bitmask */
@ -694,7 +717,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
event = cpu_hw_evt->events[fidx];
if (!event) {
csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
return IRQ_NONE;
}
@ -708,7 +731,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
* Overflow interrupt pending bit should only be cleared after stopping
* all the counters to avoid any race condition.
*/
csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
/* No overflow bit is set */
if (!overflow)
@ -780,8 +803,7 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
if (riscv_pmu_use_irq) {
cpu_hw_evt->irq = riscv_pmu_irq;
csr_clear(CSR_IP, BIT(riscv_pmu_irq_num));
csr_set(CSR_IE, BIT(riscv_pmu_irq_num));
ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
}
@ -792,7 +814,6 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
{
if (riscv_pmu_use_irq) {
disable_percpu_irq(riscv_pmu_irq);
csr_clear(CSR_IE, BIT(riscv_pmu_irq_num));
}
/* Disable all counters access for user mode now */
@ -816,8 +837,14 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
riscv_cached_mimpid(0) == 0) {
riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU;
riscv_pmu_use_irq = true;
} else if (riscv_isa_extension_available(NULL, XANDESPMU) &&
IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) {
riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI;
riscv_pmu_use_irq = true;
}
riscv_pmu_irq_mask = BIT(riscv_pmu_irq_num % BITS_PER_LONG);
if (!riscv_pmu_use_irq)
return -EOPNOTSUPP;
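The irq mask above is taken modulo BITS_PER_LONG because the Andes overflow interrupt is expressed as a vendor cause number that can exceed XLEN, while the bit to clear in the vendor SLIP CSR is only its low-order part; for the standard Sscofpmf path the number is already below BITS_PER_LONG and the modulo is a no-op. A toy illustration with a made-up cause number (the real ANDES_SLI_CAUSE_BASE and ANDES_RV_IRQ_PMOVI values come from <linux/soc/andes/irq.h> and are not shown in this diff):

#include <stdio.h>

int main(void)
{
	unsigned int irq_num = 256 + 18;               /* hypothetical vendor cause number */
	unsigned long irq_mask = 1UL << (irq_num % 64);

	printf("mask = %#lx\n", irq_mask);             /* bit 18 -> 0x40000 */
	return 0;
}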


@ -5,12 +5,12 @@
#include <asm/types.h>
/**
* __ffs - find first bit in word.
* generic___ffs - find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __ffs(unsigned long word)
static __always_inline unsigned long generic___ffs(unsigned long word)
{
int num = 0;
@ -41,4 +41,8 @@ static __always_inline unsigned long __ffs(unsigned long word)
return num;
}
#ifndef __HAVE_ARCH___FFS
#define __ffs(word) generic___ffs(word)
#endif
#endif /* _ASM_GENERIC_BITOPS___FFS_H_ */
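The new __HAVE_ARCH___FFS guard lets an architecture provide its own __ffs() while keeping generic___ffs() available as a fallback name. A hypothetical arch header fragment showing the intended usage pattern (the ctz-based body is illustrative only, not the actual RISC-V implementation, which patches in Zbb instructions via alternatives):

/* arch/<arch>/include/asm/bitops.h (sketch) */
#define __HAVE_ARCH___FFS
static __always_inline unsigned long __ffs(unsigned long word)
{
	return __builtin_ctzl(word);		/* single instruction on most targets */
}
#include <asm-generic/bitops/__ffs.h>		/* generic___ffs() remains usable */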


@ -5,12 +5,12 @@
#include <asm/types.h>
/**
* __fls - find last (most-significant) set bit in a long word
* generic___fls - find last (most-significant) set bit in a long word
* @word: the word to search
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __fls(unsigned long word)
static __always_inline unsigned long generic___fls(unsigned long word)
{
int num = BITS_PER_LONG - 1;
@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word)
return num;
}
#ifndef __HAVE_ARCH___FLS
#define __fls(word) generic___fls(word)
#endif
#endif /* _ASM_GENERIC_BITOPS___FLS_H_ */


@ -3,14 +3,14 @@
#define _ASM_GENERIC_BITOPS_FFS_H_
/**
* ffs - find first bit set
* generic_ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from ffz (man ffs).
*/
static inline int ffs(int x)
static inline int generic_ffs(int x)
{
int r = 1;
@ -39,4 +39,8 @@ static inline int ffs(int x)
return r;
}
#ifndef __HAVE_ARCH_FFS
#define ffs(x) generic_ffs(x)
#endif
#endif /* _ASM_GENERIC_BITOPS_FFS_H_ */


@ -3,14 +3,14 @@
#define _ASM_GENERIC_BITOPS_FLS_H_
/**
* fls - find last (most-significant) bit set
* generic_fls - find last (most-significant) bit set
* @x: the word to search
*
* This is defined the same way as ffs.
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
static __always_inline int fls(unsigned int x)
static __always_inline int generic_fls(unsigned int x)
{
int r = 32;
@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x)
return r;
}
#ifndef __HAVE_ARCH_FLS
#define fls(x) generic_fls(x)
#endif
#endif /* _ASM_GENERIC_BITOPS_FLS_H_ */


@ -87,7 +87,7 @@ extern int sysctl_legacy_va_layout;
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern const int mmap_rnd_bits_max;
extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS


@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void)
}
#endif
#endif /* _LINUX_SYNC_CORE_H */
#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD
#include <asm/sync_core.h>
#else
/*
* This is a dummy prepare_sync_core_cmd() implementation that can be used on
* all architectures which provide unconditional core serializing instructions
* in switch_mm().
* If your architecture doesn't provide such core serializing instructions in
* switch_mm(), you may need to write your own functions.
*/
static inline void prepare_sync_core_cmd(struct mm_struct *mm)
{
}
#endif
#endif /* _LINUX_SYNC_CORE_H */
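prepare_sync_core_cmd() is the arch hook invoked on the membarrier SYNC_CORE path before the IPIs go out; the RISC-V series uses it so that switch_mm() knows a FENCE.I is needed before the next return to userspace. A hypothetical arch-side implementation, only to show the shape of the hook (the icache_stale_mask field is borrowed from the existing RISC-V deferred icache-flush machinery and is an assumption here):

/* arch/<arch>/include/asm/sync_core.h (hypothetical sketch) */
#ifndef _ASM_SYNC_CORE_H
#define _ASM_SYNC_CORE_H

#include <linux/cpumask.h>
#include <linux/mm_types.h>

static inline void prepare_sync_core_cmd(struct mm_struct *mm)
{
	/*
	 * Mark every CPU as needing a core-serializing instruction (FENCE.I)
	 * the next time it switches to @mm, so switch_mm() can issue it
	 * before returning to userspace.
	 */
	cpumask_setall(&mm->context.icache_stale_mask);
}

#endif /* _ASM_SYNC_CORE_H */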


@ -1986,6 +1986,9 @@ source "kernel/Kconfig.locks"
config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
bool
config ARCH_HAS_PREPARE_SYNC_CORE_CMD
bool
config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
bool


@ -6647,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
* after coming from user-space, before storing to rq->curr; this
* barrier matches a full barrier in the proximity of the membarrier
* system call exit.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@ -6718,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
* RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
* on PowerPC and on RISC-V.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*
* The barrier matches a full barrier in the proximity of
* the membarrier system call entry.
*
* On RISC-V, this barrier pairing is also needed for the
* SYNC_CORE command when switching between processes, cf.
* the inline comments in membarrier_arch_switch_mm().
*/
++*switch_count;


@ -254,7 +254,7 @@ static int membarrier_global_expedited(void)
return 0;
/*
* Matches memory barriers around rq->curr modification in
* Matches memory barriers after rq->curr modification in
* scheduler.
*/
smp_mb(); /* system call entry is not a mb. */
@ -304,7 +304,7 @@ static int membarrier_global_expedited(void)
/*
* Memory barrier on the caller thread _after_ we finished
* waiting for the last IPI. Matches memory barriers around
* waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
prepare_sync_core_cmd(mm);
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return 0;
/*
* Matches memory barriers around rq->curr modification in
* Matches memory barriers after rq->curr modification in
* scheduler.
*
* On RISC-V, this barrier pairing is also needed for the
* SYNC_CORE command when switching between processes, cf.
* the inline comments in membarrier_arch_switch_mm().
*/
smp_mb(); /* system call entry is not a mb. */
@ -420,7 +425,7 @@ out:
/*
* Memory barrier on the caller thread _after_ we finished
* waiting for the last IPI. Matches memory barriers around
* waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
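From userspace nothing changes: a process registers for private expedited SYNC_CORE once and then issues the command, which with this series now provides FENCE.I semantics on RISC-V for all of its running threads. A minimal usage sketch via the raw syscall (the three-argument form with a cpu_id parameter is assumed; libc provides no wrapper):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
		return 1;

	/* ... modify executable code in this address space ... */

	/* Ensure every running thread executes a core-serializing instruction. */
	return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
}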


@ -64,7 +64,7 @@
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS


@ -0,0 +1,68 @@
[
{
"ArchStdEvent": "FW_MISALIGNED_LOAD"
},
{
"ArchStdEvent": "FW_MISALIGNED_STORE"
},
{
"ArchStdEvent": "FW_ACCESS_LOAD"
},
{
"ArchStdEvent": "FW_ACCESS_STORE"
},
{
"ArchStdEvent": "FW_ILLEGAL_INSN"
},
{
"ArchStdEvent": "FW_SET_TIMER"
},
{
"ArchStdEvent": "FW_IPI_SENT"
},
{
"ArchStdEvent": "FW_IPI_RECEIVED"
},
{
"ArchStdEvent": "FW_FENCE_I_SENT"
},
{
"ArchStdEvent": "FW_FENCE_I_RECEIVED"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_SENT"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
},
{
"ArchStdEvent": "FW_HFENCE_GVMA_SENT"
},
{
"ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
},
{
"ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
},
{
"ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
},
{
"ArchStdEvent": "FW_HFENCE_VVMA_SENT"
},
{
"ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
},
{
"ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
},
{
"ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
}
]


@ -0,0 +1,127 @@
[
{
"EventCode": "0x10",
"EventName": "cycle_count",
"BriefDescription": "Cycle count"
},
{
"EventCode": "0x20",
"EventName": "inst_count",
"BriefDescription": "Retired instruction count"
},
{
"EventCode": "0x30",
"EventName": "int_load_inst",
"BriefDescription": "Integer load instruction count"
},
{
"EventCode": "0x40",
"EventName": "int_store_inst",
"BriefDescription": "Integer store instruction count"
},
{
"EventCode": "0x50",
"EventName": "atomic_inst",
"BriefDescription": "Atomic instruction count"
},
{
"EventCode": "0x60",
"EventName": "sys_inst",
"BriefDescription": "System instruction count"
},
{
"EventCode": "0x70",
"EventName": "int_compute_inst",
"BriefDescription": "Integer computational instruction count"
},
{
"EventCode": "0x80",
"EventName": "condition_br",
"BriefDescription": "Conditional branch instruction count"
},
{
"EventCode": "0x90",
"EventName": "taken_condition_br",
"BriefDescription": "Taken conditional branch instruction count"
},
{
"EventCode": "0xA0",
"EventName": "jal_inst",
"BriefDescription": "JAL instruction count"
},
{
"EventCode": "0xB0",
"EventName": "jalr_inst",
"BriefDescription": "JALR instruction count"
},
{
"EventCode": "0xC0",
"EventName": "ret_inst",
"BriefDescription": "Return instruction count"
},
{
"EventCode": "0xD0",
"EventName": "control_trans_inst",
"BriefDescription": "Control transfer instruction count"
},
{
"EventCode": "0xE0",
"EventName": "ex9_inst",
"BriefDescription": "EXEC.IT instruction count"
},
{
"EventCode": "0xF0",
"EventName": "int_mul_inst",
"BriefDescription": "Integer multiplication instruction count"
},
{
"EventCode": "0x100",
"EventName": "int_div_rem_inst",
"BriefDescription": "Integer division/remainder instruction count"
},
{
"EventCode": "0x110",
"EventName": "float_load_inst",
"BriefDescription": "Floating-point load instruction count"
},
{
"EventCode": "0x120",
"EventName": "float_store_inst",
"BriefDescription": "Floating-point store instruction count"
},
{
"EventCode": "0x130",
"EventName": "float_add_sub_inst",
"BriefDescription": "Floating-point addition/subtraction instruction count"
},
{
"EventCode": "0x140",
"EventName": "float_mul_inst",
"BriefDescription": "Floating-point multiplication instruction count"
},
{
"EventCode": "0x150",
"EventName": "float_fused_muladd_inst",
"BriefDescription": "Floating-point fused multiply-add instruction count"
},
{
"EventCode": "0x160",
"EventName": "float_div_sqrt_inst",
"BriefDescription": "Floating-point division or square-root instruction count"
},
{
"EventCode": "0x170",
"EventName": "other_float_inst",
"BriefDescription": "Other floating-point instruction count"
},
{
"EventCode": "0x180",
"EventName": "int_mul_add_sub_inst",
"BriefDescription": "Integer multiplication and add/sub instruction count"
},
{
"EventCode": "0x190",
"EventName": "retired_ops",
"BriefDescription": "Retired operation count"
}
]


@ -0,0 +1,57 @@
[
{
"EventCode": "0x01",
"EventName": "ilm_access",
"BriefDescription": "ILM access"
},
{
"EventCode": "0x11",
"EventName": "dlm_access",
"BriefDescription": "DLM access"
},
{
"EventCode": "0x21",
"EventName": "icache_access",
"BriefDescription": "ICACHE access"
},
{
"EventCode": "0x31",
"EventName": "icache_miss",
"BriefDescription": "ICACHE miss"
},
{
"EventCode": "0x41",
"EventName": "dcache_access",
"BriefDescription": "DCACHE access"
},
{
"EventCode": "0x51",
"EventName": "dcache_miss",
"BriefDescription": "DCACHE miss"
},
{
"EventCode": "0x61",
"EventName": "dcache_load_access",
"BriefDescription": "DCACHE load access"
},
{
"EventCode": "0x71",
"EventName": "dcache_load_miss",
"BriefDescription": "DCACHE load miss"
},
{
"EventCode": "0x81",
"EventName": "dcache_store_access",
"BriefDescription": "DCACHE store access"
},
{
"EventCode": "0x91",
"EventName": "dcache_store_miss",
"BriefDescription": "DCACHE store miss"
},
{
"EventCode": "0xA1",
"EventName": "dcache_wb",
"BriefDescription": "DCACHE writeback"
}
]

Some files were not shown because too many files have changed in this diff.