linux/drivers/firmware/efi/efi.c

1193 lines
30 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* efi.c - EFI subsystem
*
* Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
* Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
* Copyright (C) 2013 Tom Gundersen <teg@jklm.no>
*
* This code registers /sys/firmware/efi{,/efivars} when EFI is supported,
* allowing the efivarfs to be mounted or the efivars module to be loaded.
* The existance of /sys/firmware/efi may also be used by userspace to
* determine that the system supports EFI.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/efi.h>
#include <linux/of.h>
#include <linux/initrd.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/platform_device.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/acpi.h>
#include <linux/ucs2_string.h>
2016-03-01 05:22:52 +08:00
#include <linux/memblock.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <asm/early_ioremap.h>
struct efi __read_mostly efi = {
.runtime_supported_mask = EFI_RT_SUPPORTED_ALL,
.acpi = EFI_INVALID_TABLE_ADDR,
.acpi20 = EFI_INVALID_TABLE_ADDR,
.smbios = EFI_INVALID_TABLE_ADDR,
.smbios3 = EFI_INVALID_TABLE_ADDR,
.esrt = EFI_INVALID_TABLE_ADDR,
.tpm_log = EFI_INVALID_TABLE_ADDR,
.tpm_final_log = EFI_INVALID_TABLE_ADDR,
#ifdef CONFIG_LOAD_UEFI_KEYS
.mokvar_table = EFI_INVALID_TABLE_ADDR,
#endif
#ifdef CONFIG_EFI_COCO_SECRET
.coco_secret = EFI_INVALID_TABLE_ADDR,
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
.unaccepted = EFI_INVALID_TABLE_ADDR,
#endif
};
EXPORT_SYMBOL(efi);
unsigned long __ro_after_init efi_rng_seed = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata initrd = EFI_INVALID_TABLE_ADDR;
extern unsigned long screen_info_table;
struct mm_struct efi_mm = {
mm: start tracking VMAs with maple tree Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct including the mm_init struct, added support in required mm/mmap functions, added tracking in kernel/fork for process forking, and used to find the unmapped_area and checked against what the rbtree finds. This also moves the mmap_lock() in exit_mmap() since the oom reaper call does walk the VMAs. Otherwise lockdep will be unhappy if oom happens. When splitting a vma fails due to allocations of the maple tree nodes, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0. There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario. Link: https://lkml.kernel.org/r/20220906194824.2110408-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Tested-by: Yu Zhao <yuzhao@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: SeongJae Park <sj@kernel.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-07 03:48:45 +08:00
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it: CPU 0 CPU 1 get_user_pages_fast() internal_get_user_pages_fast() copy_page_range() pte_alloc_map_lock() copy_present_page() atomic_read(has_pinned) == 0 page_maybe_dma_pinned() == false atomic_set(has_pinned, 1); gup_pgd_range() gup_pte_range() pte_t pte = gup_get_pte(ptep) pte_access_permitted(pte) try_grab_compound_head() pte = pte_wrprotect(pte) set_pte_at(); pte_unmap_unlock() // GUP now returns with a write protected page The first attempt to resolve this by using the write protect caused problems (and was missing a barrrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. [akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Reviewed-by: John Hubbard <jhubbard@nvidia.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Peter Xu <peterx@redhat.com> Acked-by: "Ahmed S. Darwish" <a.darwish@linutronix.de> [seqcount_t parts] Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Hugh Dickins <hughd@google.com> Cc: Jann Horn <jannh@google.com> Cc: Kirill Shutemov <kirill@shutemov.name> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Leon Romanovsky <leonro@nvidia.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-12-15 11:05:44 +08:00
.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(efi_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
};
efi: Use a work queue to invoke EFI Runtime Services Presently, when a user process requests the kernel to execute any UEFI runtime service, the kernel temporarily switches to a separate set of page tables that describe the virtual mapping of the UEFI runtime services regions in memory. Since UEFI runtime services are typically invoked with interrupts enabled, any code that may be called during this time, will have an incorrect view of the process's address space. Although it is unusual for code running in interrupt context to make assumptions about the process context it runs in, there are cases (such as the perf subsystem taking samples) where this causes problems. So let's set up a work queue for calling UEFI runtime services, so that the actual calls are made when the work queue items are dispatched by a work queue worker running in a separate kernel thread. Such threads are not expected to have userland mappings in the first place, and so the additional mappings created for the UEFI runtime services can never clash with any. The ResetSystem() runtime service is not covered by the work queue handling, since it is not expected to return, and may be called at a time when the kernel is torn down to the point where we cannot expect work queues to still be operational. The non-blocking variants of SetVariable() and QueryVariableInfo() are also excluded: these are intended to be used from atomic context, which obviously rules out waiting for a completion to be signalled by another thread. Note that these variants are currently only used for UEFI runtime services calls that occur very early in the boot, and for ones that occur in critical conditions, e.g., to flush kernel logs to UEFI variables via efi-pstore. Suggested-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> [ardb: exclude ResetSystem() from the workqueue treatment merge from 2 separate patches and rewrite commit log] Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180711094040.12506-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-07-11 17:40:35 +08:00
struct workqueue_struct *efi_rts_wq;
static bool disable_runtime = IS_ENABLED(CONFIG_EFI_DISABLE_RUNTIME);
static int __init setup_noefi(char *arg)
{
disable_runtime = true;
return 0;
}
early_param("noefi", setup_noefi);
bool efi_runtime_disabled(void)
{
return disable_runtime;
}
bool __pure __efi_soft_reserve_enabled(void)
{
return !efi_enabled(EFI_MEM_NO_SOFT_RESERVE);
}
static int __init parse_efi_cmdline(char *str)
{
efi: Check for NULL efi kernel parameters Even though it is documented how to specifiy efi parameters, it is possible to cause a kernel panic due to a dereference of a NULL pointer when parsing such parameters if "efi" alone is given: PANIC: early exception 0e rip 10:ffffffff812fb361 error 0 cr2 0 [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.2.0-rc1+ #450 [ 0.000000] ffffffff81fe20a9 ffffffff81e03d50 ffffffff8184bb0f 00000000000003f8 [ 0.000000] 0000000000000000 ffffffff81e03e08 ffffffff81f371a1 64656c62616e6520 [ 0.000000] 0000000000000069 000000000000005f 0000000000000000 0000000000000000 [ 0.000000] Call Trace: [ 0.000000] [<ffffffff8184bb0f>] dump_stack+0x45/0x57 [ 0.000000] [<ffffffff81f371a1>] early_idt_handler_common+0x81/0xae [ 0.000000] [<ffffffff812fb361>] ? parse_option_str+0x11/0x90 [ 0.000000] [<ffffffff81f4dd69>] arch_parse_efi_cmdline+0x15/0x42 [ 0.000000] [<ffffffff81f376e1>] do_early_param+0x50/0x8a [ 0.000000] [<ffffffff8106b1b3>] parse_args+0x1e3/0x400 [ 0.000000] [<ffffffff81f37a43>] parse_early_options+0x24/0x28 [ 0.000000] [<ffffffff81f37691>] ? loglevel+0x31/0x31 [ 0.000000] [<ffffffff81f37a78>] parse_early_param+0x31/0x3d [ 0.000000] [<ffffffff81f3ae98>] setup_arch+0x2de/0xc08 [ 0.000000] [<ffffffff8109629a>] ? vprintk_default+0x1a/0x20 [ 0.000000] [<ffffffff81f37b20>] start_kernel+0x90/0x423 [ 0.000000] [<ffffffff81f37495>] x86_64_start_reservations+0x2a/0x2c [ 0.000000] [<ffffffff81f37582>] x86_64_start_kernel+0xeb/0xef [ 0.000000] RIP 0xffffffff81ba2efc This panic is not reproducible with "efi=" as this will result in a non-NULL zero-length string. Thus, verify that the pointer to the parameter string is not NULL. This is consistent with other parameter-parsing functions which check for NULL pointers. Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> Cc: Dave Young <dyoung@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Matt Fleming <matt.fleming@intel.com>
2015-07-16 10:36:03 +08:00
if (!str) {
pr_warn("need at least one option\n");
return -EINVAL;
}
if (parse_option_str(str, "debug"))
set_bit(EFI_DBG, &efi.flags);
if (parse_option_str(str, "noruntime"))
disable_runtime = true;
if (parse_option_str(str, "runtime"))
disable_runtime = false;
if (parse_option_str(str, "nosoftreserve"))
set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags);
return 0;
}
early_param("efi", parse_efi_cmdline);
struct kobject *efi_kobj;
/*
* Let's not leave out systab information that snuck into
* the efivars driver
* Note, do not add more fields in systab sysfs file as it breaks sysfs
* one value per file rule!
*/
static ssize_t systab_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
char *str = buf;
if (!kobj || !buf)
return -EINVAL;
if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "ACPI20=0x%lx\n", efi.acpi20);
if (efi.acpi != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
/*
* If both SMBIOS and SMBIOS3 entry points are implemented, the
* SMBIOS3 entry point shall be preferred, so we list it first to
* let applications stop parsing after the first match.
*/
if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "SMBIOS3=0x%lx\n", efi.smbios3);
if (efi.smbios != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
arch: Remove Itanium (IA-64) architecture The Itanium architecture is obsolete, and an informal survey [0] reveals that any residual use of Itanium hardware in production is mostly HP-UX or OpenVMS based. The use of Linux on Itanium appears to be limited to enthusiasts that occasionally boot a fresh Linux kernel to see whether things are still working as intended, and perhaps to churn out some distro packages that are rarely used in practice. None of the original companies behind Itanium still produce or support any hardware or software for the architecture, and it is listed as 'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers that contributed on behalf of those companies (nor anyone else, for that matter) have been willing to support or maintain the architecture upstream or even be responsible for applying the odd fix. The Intel firmware team removed all IA-64 support from the Tianocore/EDK2 reference implementation of EFI in 2018. (Itanium is the original architecture for which EFI was developed, and the way Linux supports it deviates significantly from other architectures.) Some distros, such as Debian and Gentoo, still maintain [unofficial] ia64 ports, but many have dropped support years ago. While the argument is being made [1] that there is a 'for the common good' angle to being able to build and run existing projects such as the Grid Community Toolkit [2] on Itanium for interoperability testing, the fact remains that none of those projects are known to be deployed on Linux/ia64, and very few people actually have access to such a system in the first place. Even if there were ways imaginable in which Linux/ia64 could be put to good use today, what matters is whether anyone is actually doing that, and this does not appear to be the case. There are no emulators widely available, and so boot testing Itanium is generally infeasible for ordinary contributors. GCC still supports IA-64 but its compile farm [3] no longer has any IA-64 machines. GLIBC would like to get rid of IA-64 [4] too because it would permit some overdue code cleanups. In summary, the benefits to the ecosystem of having IA-64 be part of it are mostly theoretical, whereas the maintenance overhead of keeping it supported is real. So let's rip off the band aid, and remove the IA-64 arch code entirely. This follows the timeline proposed by the Debian/ia64 maintainer [5], which removes support in a controlled manner, leaving IA-64 in a known good state in the most recent LTS release. Other projects will follow once the kernel support is removed. [0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/ [1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/ [2] https://gridcf.org/gct-docs/latest/index.html [3] https://cfarm.tetaneutral.net/machines/list/ [4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/ [5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/ Acked-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2022-10-20 21:54:33 +08:00
if (IS_ENABLED(CONFIG_X86))
str = efi_systab_show_arch(str);
return str - buf;
}
static struct kobj_attribute efi_attr_systab = __ATTR_RO_MODE(systab, 0400);
static ssize_t fw_platform_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", efi_enabled(EFI_64BIT) ? 64 : 32);
}
extern __weak struct kobj_attribute efi_attr_fw_vendor;
extern __weak struct kobj_attribute efi_attr_runtime;
extern __weak struct kobj_attribute efi_attr_config_table;
static struct kobj_attribute efi_attr_fw_platform_size =
__ATTR_RO(fw_platform_size);
static struct attribute *efi_subsys_attrs[] = {
&efi_attr_systab.attr,
&efi_attr_fw_platform_size.attr,
&efi_attr_fw_vendor.attr,
&efi_attr_runtime.attr,
&efi_attr_config_table.attr,
NULL,
};
umode_t __weak efi_attr_is_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
return attr->mode;
}
static const struct attribute_group efi_subsys_attr_group = {
.attrs = efi_subsys_attrs,
.is_visible = efi_attr_is_visible,
};
struct blocking_notifier_head efivar_ops_nh;
EXPORT_SYMBOL_GPL(efivar_ops_nh);
static struct efivars generic_efivars;
static struct efivar_operations generic_ops;
static bool generic_ops_supported(void)
{
unsigned long name_size;
efi_status_t status;
efi_char16_t name;
efi_guid_t guid;
name_size = sizeof(name);
if (!efi.get_next_variable)
return false;
status = efi.get_next_variable(&name_size, &name, &guid);
if (status == EFI_UNSUPPORTED)
return false;
return true;
}
static int generic_ops_register(void)
{
if (!generic_ops_supported())
return 0;
generic_ops.get_variable = efi.get_variable;
generic_ops.get_next_variable = efi.get_next_variable;
generic_ops.query_variable_store = efi_query_variable_store;
generic_ops.query_variable_info = efi.query_variable_info;
efi/efivars: Expose RT service availability via efivars abstraction Commit bf67fad19e493b ("efi: Use more granular check for availability for variable services") introduced a check into the efivarfs, efi-pstore and other drivers that aborts loading of the module if not all three variable runtime services (GetVariable, SetVariable and GetNextVariable) are supported. However, this results in efivarfs being unavailable entirely if only SetVariable support is missing, which is only needed if you want to make any modifications. Also, efi-pstore and the sysfs EFI variable interface could be backed by another implementation of the 'efivars' abstraction, in which case it is completely irrelevant which services are supported by the EFI firmware. So make the generic 'efivars' abstraction dependent on the availibility of the GetVariable and GetNextVariable EFI runtime services, and add a helper 'efivar_supports_writes()' to find out whether the currently active efivars abstraction supports writes (and wire it up to the availability of SetVariable for the generic one). Then, use the efivar_supports_writes() helper to decide whether to permit efivarfs to be mounted read-write, and whether to enable efi-pstore or the sysfs EFI variable interface altogether. Fixes: bf67fad19e493b ("efi: Use more granular check for availability for variable services") Reported-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Tested-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2020-07-08 18:01:57 +08:00
if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) {
generic_ops.set_variable = efi.set_variable;
generic_ops.set_variable_nonblocking = efi.set_variable_nonblocking;
}
return efivars_register(&generic_efivars, &generic_ops);
}
static void generic_ops_unregister(void)
{
if (!generic_ops.get_variable)
return;
efivars_unregister(&generic_efivars);
}
void efivars_generic_ops_register(void)
{
generic_ops_register();
}
EXPORT_SYMBOL_GPL(efivars_generic_ops_register);
void efivars_generic_ops_unregister(void)
{
generic_ops_unregister();
}
EXPORT_SYMBOL_GPL(efivars_generic_ops_unregister);
#ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS
#define EFIVAR_SSDT_NAME_MAX 16UL
static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata;
static int __init efivar_ssdt_setup(char *str)
{
int ret = security_locked_down(LOCKDOWN_ACPI_TABLES);
if (ret)
return ret;
if (strlen(str) < sizeof(efivar_ssdt))
memcpy(efivar_ssdt, str, strlen(str));
else
pr_warn("efivar_ssdt: name too long: %s\n", str);
2022-03-01 12:18:51 +08:00
return 1;
}
__setup("efivar_ssdt=", efivar_ssdt_setup);
static __init int efivar_ssdt_load(void)
{
unsigned long name_size = 256;
efi_char16_t *name = NULL;
efi_status_t status;
efi_guid_t guid;
2019-10-03 00:58:59 +08:00
if (!efivar_ssdt[0])
return 0;
name = kzalloc(name_size, GFP_KERNEL);
if (!name)
return -ENOMEM;
for (;;) {
char utf8_name[EFIVAR_SSDT_NAME_MAX];
unsigned long data_size = 0;
void *data;
int limit;
status = efi.get_next_variable(&name_size, name, &guid);
if (status == EFI_NOT_FOUND) {
break;
} else if (status == EFI_BUFFER_TOO_SMALL) {
efi_char16_t *name_tmp =
krealloc(name, name_size, GFP_KERNEL);
if (!name_tmp) {
kfree(name);
return -ENOMEM;
}
name = name_tmp;
continue;
}
limit = min(EFIVAR_SSDT_NAME_MAX, name_size);
ucs2_as_utf8(utf8_name, name, limit - 1);
if (strncmp(utf8_name, efivar_ssdt, limit) != 0)
continue;
pr_info("loading SSDT from variable %s-%pUl\n", efivar_ssdt, &guid);
status = efi.get_variable(name, &guid, NULL, &data_size, NULL);
if (status != EFI_BUFFER_TOO_SMALL || !data_size)
return -EIO;
data = kmalloc(data_size, GFP_KERNEL);
if (!data)
return -ENOMEM;
status = efi.get_variable(name, &guid, NULL, &data_size, data);
if (status == EFI_SUCCESS) {
acpi_status ret = acpi_load_table(data, NULL);
if (ret)
pr_err("failed to load table: %u\n", ret);
else
continue;
} else {
pr_err("failed to get var data: 0x%lx\n", status);
}
kfree(data);
}
return 0;
}
#else
static inline int efivar_ssdt_load(void) { return 0; }
#endif
#ifdef CONFIG_DEBUG_FS
#define EFI_DEBUGFS_MAX_BLOBS 32
static struct debugfs_blob_wrapper debugfs_blob[EFI_DEBUGFS_MAX_BLOBS];
static void __init efi_debugfs_init(void)
{
struct dentry *efi_debugfs;
efi_memory_desc_t *md;
char name[32];
int type_count[EFI_BOOT_SERVICES_DATA + 1] = {};
int i = 0;
efi_debugfs = debugfs_create_dir("efi", NULL);
if (IS_ERR_OR_NULL(efi_debugfs))
return;
for_each_efi_memory_desc(md) {
switch (md->type) {
case EFI_BOOT_SERVICES_CODE:
snprintf(name, sizeof(name), "boot_services_code%d",
type_count[md->type]++);
break;
case EFI_BOOT_SERVICES_DATA:
snprintf(name, sizeof(name), "boot_services_data%d",
type_count[md->type]++);
break;
default:
continue;
}
if (i >= EFI_DEBUGFS_MAX_BLOBS) {
pr_warn("More then %d EFI boot service segments, only showing first %d in debugfs\n",
EFI_DEBUGFS_MAX_BLOBS, EFI_DEBUGFS_MAX_BLOBS);
break;
}
debugfs_blob[i].size = md->num_pages << EFI_PAGE_SHIFT;
debugfs_blob[i].data = memremap(md->phys_addr,
debugfs_blob[i].size,
MEMREMAP_WB);
if (!debugfs_blob[i].data)
continue;
debugfs_create_blob(name, 0400, efi_debugfs, &debugfs_blob[i]);
i++;
}
}
#else
static inline void efi_debugfs_init(void) {}
#endif
/*
* We register the efi subsystem with the firmware subsystem and the
* efivars subsystem with the efi subsystem, if the system was booted with
* EFI.
*/
static int __init efisubsys_init(void)
{
int error;
if (!efi_enabled(EFI_RUNTIME_SERVICES))
efi.runtime_supported_mask = 0;
if (!efi_enabled(EFI_BOOT))
return 0;
if (efi.runtime_supported_mask) {
/*
* Since we process only one efi_runtime_service() at a time, an
* ordered workqueue (which creates only one execution context)
* should suffice for all our needs.
*/
efi_rts_wq = alloc_ordered_workqueue("efi_rts_wq", 0);
if (!efi_rts_wq) {
pr_err("Creating efi_rts_wq failed, EFI runtime services disabled.\n");
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi.runtime_supported_mask = 0;
return 0;
}
efi: Use a work queue to invoke EFI Runtime Services Presently, when a user process requests the kernel to execute any UEFI runtime service, the kernel temporarily switches to a separate set of page tables that describe the virtual mapping of the UEFI runtime services regions in memory. Since UEFI runtime services are typically invoked with interrupts enabled, any code that may be called during this time, will have an incorrect view of the process's address space. Although it is unusual for code running in interrupt context to make assumptions about the process context it runs in, there are cases (such as the perf subsystem taking samples) where this causes problems. So let's set up a work queue for calling UEFI runtime services, so that the actual calls are made when the work queue items are dispatched by a work queue worker running in a separate kernel thread. Such threads are not expected to have userland mappings in the first place, and so the additional mappings created for the UEFI runtime services can never clash with any. The ResetSystem() runtime service is not covered by the work queue handling, since it is not expected to return, and may be called at a time when the kernel is torn down to the point where we cannot expect work queues to still be operational. The non-blocking variants of SetVariable() and QueryVariableInfo() are also excluded: these are intended to be used from atomic context, which obviously rules out waiting for a completion to be signalled by another thread. Note that these variants are currently only used for UEFI runtime services calls that occur very early in the boot, and for ones that occur in critical conditions, e.g., to flush kernel logs to UEFI variables via efi-pstore. Suggested-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> [ardb: exclude ResetSystem() from the workqueue treatment merge from 2 separate patches and rewrite commit log] Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180711094040.12506-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-07-11 17:40:35 +08:00
}
if (efi_rt_services_supported(EFI_RT_SUPPORTED_TIME_SERVICES))
platform_device_register_simple("rtc-efi", 0, NULL, 0);
/* We register the efi directory at /sys/firmware/efi */
efi_kobj = kobject_create_and_add("efi", firmware_kobj);
if (!efi_kobj) {
pr_err("efi: Firmware registration failed.\n");
error = -ENOMEM;
goto err_destroy_wq;
}
efi/efivars: Expose RT service availability via efivars abstraction Commit bf67fad19e493b ("efi: Use more granular check for availability for variable services") introduced a check into the efivarfs, efi-pstore and other drivers that aborts loading of the module if not all three variable runtime services (GetVariable, SetVariable and GetNextVariable) are supported. However, this results in efivarfs being unavailable entirely if only SetVariable support is missing, which is only needed if you want to make any modifications. Also, efi-pstore and the sysfs EFI variable interface could be backed by another implementation of the 'efivars' abstraction, in which case it is completely irrelevant which services are supported by the EFI firmware. So make the generic 'efivars' abstraction dependent on the availibility of the GetVariable and GetNextVariable EFI runtime services, and add a helper 'efivar_supports_writes()' to find out whether the currently active efivars abstraction supports writes (and wire it up to the availability of SetVariable for the generic one). Then, use the efivar_supports_writes() helper to decide whether to permit efivarfs to be mounted read-write, and whether to enable efi-pstore or the sysfs EFI variable interface altogether. Fixes: bf67fad19e493b ("efi: Use more granular check for availability for variable services") Reported-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Tested-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2020-07-08 18:01:57 +08:00
if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE |
EFI_RT_SUPPORTED_GET_NEXT_VARIABLE_NAME)) {
error = generic_ops_register();
if (error)
goto err_put;
efivar_ssdt_load();
platform_device_register_simple("efivars", 0, NULL, 0);
}
BLOCKING_INIT_NOTIFIER_HEAD(&efivar_ops_nh);
error = sysfs_create_group(efi_kobj, &efi_subsys_attr_group);
if (error) {
pr_err("efi: Sysfs attribute export failed with error %d.\n",
error);
goto err_unregister;
}
/* and the standard mountpoint for efivarfs */
error = sysfs_create_mount_point(efi_kobj, "efivars");
if (error) {
pr_err("efivars: Subsystem registration failed.\n");
goto err_remove_group;
}
if (efi_enabled(EFI_DBG) && efi_enabled(EFI_PRESERVE_BS_REGIONS))
efi_debugfs_init();
#ifdef CONFIG_EFI_COCO_SECRET
if (efi.coco_secret != EFI_INVALID_TABLE_ADDR)
platform_device_register_simple("efi_secret", 0, NULL, 0);
#endif
return 0;
err_remove_group:
sysfs_remove_group(efi_kobj, &efi_subsys_attr_group);
err_unregister:
efi/efivars: Expose RT service availability via efivars abstraction Commit bf67fad19e493b ("efi: Use more granular check for availability for variable services") introduced a check into the efivarfs, efi-pstore and other drivers that aborts loading of the module if not all three variable runtime services (GetVariable, SetVariable and GetNextVariable) are supported. However, this results in efivarfs being unavailable entirely if only SetVariable support is missing, which is only needed if you want to make any modifications. Also, efi-pstore and the sysfs EFI variable interface could be backed by another implementation of the 'efivars' abstraction, in which case it is completely irrelevant which services are supported by the EFI firmware. So make the generic 'efivars' abstraction dependent on the availibility of the GetVariable and GetNextVariable EFI runtime services, and add a helper 'efivar_supports_writes()' to find out whether the currently active efivars abstraction supports writes (and wire it up to the availability of SetVariable for the generic one). Then, use the efivar_supports_writes() helper to decide whether to permit efivarfs to be mounted read-write, and whether to enable efi-pstore or the sysfs EFI variable interface altogether. Fixes: bf67fad19e493b ("efi: Use more granular check for availability for variable services") Reported-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Tested-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2020-07-08 18:01:57 +08:00
if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE |
EFI_RT_SUPPORTED_GET_NEXT_VARIABLE_NAME))
generic_ops_unregister();
err_put:
kobject_put(efi_kobj);
efi_kobj = NULL;
err_destroy_wq:
if (efi_rts_wq)
destroy_workqueue(efi_rts_wq);
return error;
}
subsys_initcall(efisubsys_init);
void __init efi_find_mirror(void)
{
efi_memory_desc_t *md;
u64 mirror_size = 0, total_size = 0;
if (!efi_enabled(EFI_MEMMAP))
return;
for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
total_size += size;
if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
memblock_mark_mirror(start, size);
mirror_size += size;
}
}
if (mirror_size)
pr_info("Memory: %lldM/%lldM mirrored memory\n",
mirror_size>>20, total_size>>20);
}
/*
* Find the efi memory descriptor for a given physical address. Given a
* physical address, determine if it exists within an EFI Memory Map entry,
* and if so, populate the supplied memory descriptor with the appropriate
* data.
*/
int __efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
{
efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP)) {
pr_err_once("EFI_MEMMAP is not enabled.\n");
return -EINVAL;
}
if (!out_md) {
pr_err_once("out_md is null.\n");
return -EINVAL;
}
for_each_efi_memory_desc(md) {
u64 size;
u64 end;
/* skip bogus entries (including empty ones) */
if ((md->phys_addr & (EFI_PAGE_SIZE - 1)) ||
(md->num_pages <= 0) ||
(md->num_pages > (U64_MAX - md->phys_addr) >> EFI_PAGE_SHIFT))
continue;
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
if (phys_addr >= md->phys_addr && phys_addr < end) {
memcpy(out_md, md, sizeof(*out_md));
return 0;
}
}
return -ENOENT;
}
extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
__weak __alias(__efi_mem_desc_lookup);
/*
* Calculate the highest address of an efi memory descriptor.
*/
u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
{
u64 size = md->num_pages << EFI_PAGE_SHIFT;
u64 end = md->phys_addr + size;
return end;
}
2016-03-01 05:22:52 +08:00
void __init __weak efi_arch_mem_reserve(phys_addr_t addr, u64 size) {}
/**
* efi_mem_reserve - Reserve an EFI memory region
* @addr: Physical address to reserve
* @size: Size of reservation
*
* Mark a region as reserved from general kernel allocation and
* prevent it being released by efi_free_boot_services().
*
* This function should be called drivers once they've parsed EFI
* configuration tables to figure out where their data lives, e.g.
* efi_esrt_init().
*/
void __init efi_mem_reserve(phys_addr_t addr, u64 size)
{
/* efi_mem_reserve() does not work under Xen */
if (WARN_ON_ONCE(efi_enabled(EFI_PARAVIRT)))
return;
2016-03-01 05:22:52 +08:00
if (!memblock_is_region_reserved(addr, size))
memblock_reserve(addr, size);
/*
* Some architectures (x86) reserve all boot services ranges
* until efi_free_boot_services() because of buggy firmware
* implementations. This means the above memblock_reserve() is
* superfluous on x86 and instead what it needs to do is
* ensure the @start, @size is not freed.
*/
efi_arch_mem_reserve(addr, size);
}
static const efi_config_table_type_t common_tables[] __initconst = {
{ACPI_20_TABLE_GUID, &efi.acpi20, "ACPI 2.0" },
{ACPI_TABLE_GUID, &efi.acpi, "ACPI" },
{SMBIOS_TABLE_GUID, &efi.smbios, "SMBIOS" },
{SMBIOS3_TABLE_GUID, &efi.smbios3, "SMBIOS 3.0" },
{EFI_SYSTEM_RESOURCE_TABLE_GUID, &efi.esrt, "ESRT" },
{EFI_MEMORY_ATTRIBUTES_TABLE_GUID, &efi_mem_attr_table, "MEMATTR" },
{LINUX_EFI_RANDOM_SEED_TABLE_GUID, &efi_rng_seed, "RNG" },
{LINUX_EFI_TPM_EVENT_LOG_GUID, &efi.tpm_log, "TPMEventLog" },
{EFI_TCG2_FINAL_EVENTS_TABLE_GUID, &efi.tpm_final_log, "TPMFinalLog" },
{EFI_CC_FINAL_EVENTS_TABLE_GUID, &efi.tpm_final_log, "CCFinalLog" },
{LINUX_EFI_MEMRESERVE_TABLE_GUID, &mem_reserve, "MEMRESERVE" },
{LINUX_EFI_INITRD_MEDIA_GUID, &initrd, "INITRD" },
{EFI_RT_PROPERTIES_TABLE_GUID, &rt_prop, "RTPROP" },
#ifdef CONFIG_EFI_RCI2_TABLE
{DELLEMC_EFI_RCI2_TABLE_GUID, &rci2_table_phys },
#endif
#ifdef CONFIG_LOAD_UEFI_KEYS
{LINUX_EFI_MOK_VARIABLE_TABLE_GUID, &efi.mokvar_table, "MOKvar" },
#endif
#ifdef CONFIG_EFI_COCO_SECRET
{LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" },
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
{LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID, &efi.unaccepted, "Unaccepted" },
#endif
#ifdef CONFIG_EFI_GENERIC_STUB
{LINUX_EFI_SCREEN_INFO_TABLE_GUID, &screen_info_table },
#endif
{},
};
static __init int match_config_table(const efi_guid_t *guid,
unsigned long table,
const efi_config_table_type_t *table_types)
{
int i;
for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) {
if (efi_guidcmp(*guid, table_types[i].guid))
continue;
if (!efi_config_table_is_usable(guid, table)) {
if (table_types[i].name[0])
pr_cont("(%s=0x%lx unusable) ",
table_types[i].name, table);
return 1;
}
*(table_types[i].ptr) = table;
if (table_types[i].name[0])
pr_cont("%s=0x%lx ", table_types[i].name, table);
return 1;
}
return 0;
}
/**
* reserve_unaccepted - Map and reserve unaccepted configuration table
* @unaccepted: Pointer to unaccepted memory table
*
* memblock_add() makes sure that the table is mapped in direct mapping. During
* normal boot it happens automatically because the table is allocated from
* usable memory. But during crashkernel boot only memory specifically reserved
* for crash scenario is mapped. memblock_add() forces the table to be mapped
* in crashkernel case.
*
* Align the range to the nearest page borders. Ranges smaller than page size
* are not going to be mapped.
*
* memblock_reserve() makes sure that future allocations will not touch the
* table.
*/
static __init void reserve_unaccepted(struct efi_unaccepted_memory *unaccepted)
{
phys_addr_t start, size;
start = PAGE_ALIGN_DOWN(efi.unaccepted);
size = PAGE_ALIGN(sizeof(*unaccepted) + unaccepted->size);
memblock_add(start, size);
memblock_reserve(start, size);
}
int __init efi_config_parse_tables(const efi_config_table_t *config_tables,
int count,
const efi_config_table_type_t *arch_tables)
{
const efi_config_table_64_t *tbl64 = (void *)config_tables;
const efi_config_table_32_t *tbl32 = (void *)config_tables;
const efi_guid_t *guid;
unsigned long table;
int i;
pr_info("");
for (i = 0; i < count; i++) {
if (!IS_ENABLED(CONFIG_X86)) {
guid = &config_tables[i].guid;
table = (unsigned long)config_tables[i].table;
} else if (efi_enabled(EFI_64BIT)) {
guid = &tbl64[i].guid;
table = tbl64[i].table;
if (IS_ENABLED(CONFIG_X86_32) &&
tbl64[i].table > U32_MAX) {
pr_cont("\n");
pr_err("Table located above 4GB, disabling EFI.\n");
return -EINVAL;
}
} else {
guid = &tbl32[i].guid;
table = tbl32[i].table;
}
if (!match_config_table(guid, table, common_tables) && arch_tables)
match_config_table(guid, table, arch_tables);
}
pr_cont("\n");
set_bit(EFI_CONFIG_TABLES, &efi.flags);
if (efi_rng_seed != EFI_INVALID_TABLE_ADDR) {
struct linux_efi_random_seed *seed;
u32 size = 0;
seed = early_memremap(efi_rng_seed, sizeof(*seed));
if (seed != NULL) {
efi: random: combine bootloader provided RNG seed with RNG protocol output Instead of blindly creating the EFI random seed configuration table if the RNG protocol is implemented and works, check whether such a EFI configuration table was provided by an earlier boot stage and if so, concatenate the existing and the new seeds, leaving it up to the core code to mix it in and credit it the way it sees fit. This can be used for, e.g., systemd-boot, to pass an additional seed to Linux in a way that can be consumed by the kernel very early. In that case, the following definitions should be used to pass the seed to the EFI stub: struct linux_efi_random_seed { u32 size; // of the 'seed' array in bytes u8 seed[]; }; The memory for the struct must be allocated as EFI_ACPI_RECLAIM_MEMORY pool memory, and the address of the struct in memory should be installed as a EFI configuration table using the following GUID: LINUX_EFI_RANDOM_SEED_TABLE_GUID 1ce1e5bc-7ceb-42f2-81e5-8aadf180f57b Note that doing so is safe even on kernels that were built without this patch applied, but the seed will simply be overwritten with a seed derived from the EFI RNG protocol, if available. The recommended seed size is 32 bytes, and seeds larger than 512 bytes are considered corrupted and ignored entirely. In order to preserve forward secrecy, seeds from previous bootloaders are memzero'd out, and in order to preserve memory, those older seeds are also freed from memory. Freeing from memory without first memzeroing is not safe to do, as it's possible that nothing else will ever overwrite those pages used by EFI. Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com> [ardb: incorporate Jason's followup changes to extend the maximum seed size on the consumer end, memzero() it and drop a needless printk] Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2022-10-20 16:39:10 +08:00
size = min_t(u32, seed->size, SZ_1K); // sanity check
early_memunmap(seed, sizeof(*seed));
} else {
pr_err("Could not map UEFI random seed!\n");
}
if (size > 0) {
seed = early_memremap(efi_rng_seed,
sizeof(*seed) + size);
if (seed != NULL) {
add_bootloader_randomness(seed->bits, size);
efi: random: combine bootloader provided RNG seed with RNG protocol output Instead of blindly creating the EFI random seed configuration table if the RNG protocol is implemented and works, check whether such a EFI configuration table was provided by an earlier boot stage and if so, concatenate the existing and the new seeds, leaving it up to the core code to mix it in and credit it the way it sees fit. This can be used for, e.g., systemd-boot, to pass an additional seed to Linux in a way that can be consumed by the kernel very early. In that case, the following definitions should be used to pass the seed to the EFI stub: struct linux_efi_random_seed { u32 size; // of the 'seed' array in bytes u8 seed[]; }; The memory for the struct must be allocated as EFI_ACPI_RECLAIM_MEMORY pool memory, and the address of the struct in memory should be installed as a EFI configuration table using the following GUID: LINUX_EFI_RANDOM_SEED_TABLE_GUID 1ce1e5bc-7ceb-42f2-81e5-8aadf180f57b Note that doing so is safe even on kernels that were built without this patch applied, but the seed will simply be overwritten with a seed derived from the EFI RNG protocol, if available. The recommended seed size is 32 bytes, and seeds larger than 512 bytes are considered corrupted and ignored entirely. In order to preserve forward secrecy, seeds from previous bootloaders are memzero'd out, and in order to preserve memory, those older seeds are also freed from memory. Freeing from memory without first memzeroing is not safe to do, as it's possible that nothing else will ever overwrite those pages used by EFI. Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com> [ardb: incorporate Jason's followup changes to extend the maximum seed size on the consumer end, memzero() it and drop a needless printk] Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2022-10-20 16:39:10 +08:00
memzero_explicit(seed->bits, size);
early_memunmap(seed, sizeof(*seed) + size);
} else {
pr_err("Could not map UEFI random seed!\n");
}
}
}
efi/x86: Ignore the memory attributes table on i386 Commit: 3a6b6c6fb23667fa ("efi: Make EFI_MEMORY_ATTRIBUTES_TABLE initialization common across all architectures") moved the call to efi_memattr_init() from ARM specific to the generic EFI init code, in order to be able to apply the restricted permissions described in that table on x86 as well. We never enabled this feature fully on i386, and so mapping and reserving this table is pointless. However, due to the early call to memblock_reserve(), the memory bookkeeping gets confused to the point where it produces the splat below when we try to map the memory later on: ------------[ cut here ]------------ ioremap on RAM at 0x3f251000 - 0x3fa1afff WARNING: CPU: 0 PID: 0 at arch/x86/mm/ioremap.c:166 __ioremap_caller ... Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.20.0 #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 EIP: __ioremap_caller.constprop.0+0x249/0x260 Code: 90 0f b7 05 4e 38 40 de 09 45 e0 e9 09 ff ff ff 90 8d 45 ec c6 05 ... EAX: 00000029 EBX: 00000000 ECX: de59c228 EDX: 00000001 ESI: 3f250fff EDI: 00000000 EBP: de3edf20 ESP: de3edee0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00200296 CR0: 80050033 CR2: ffd17000 CR3: 1e58c000 CR4: 00040690 Call Trace: ioremap_cache+0xd/0x10 ? old_map_region+0x72/0x9d old_map_region+0x72/0x9d efi_map_region+0x8/0xa efi_enter_virtual_mode+0x260/0x43b start_kernel+0x329/0x3aa i386_start_kernel+0xa7/0xab startup_32_smp+0x164/0x168 ---[ end trace e15ccf6b9f356833 ]--- Let's work around this by disregarding the memory attributes table altogether on i386, which does not result in a loss of functionality or protection, given that we never consumed the contents. Fixes: 3a6b6c6fb23667fa ("efi: Make EFI_MEMORY_ATTRIBUTES_TABLE ... ") Tested-by: Arvind Sankar <nivedita@alum.mit.edu> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20200304165917.5893-1-ardb@kernel.org Link: https://lore.kernel.org/r/20200308080859.21568-21-ardb@kernel.org
2020-03-08 16:08:51 +08:00
if (!IS_ENABLED(CONFIG_X86_32) && efi_enabled(EFI_MEMMAP))
efi_memattr_init();
efi_tpm_eventlog_init();
if (mem_reserve != EFI_INVALID_TABLE_ADDR) {
unsigned long prsv = mem_reserve;
while (prsv) {
struct linux_efi_memreserve *rsv;
u8 *p;
/*
* Just map a full page: that is what we will get
* anyway, and it permits us to map the entire entry
* before knowing its size.
*/
p = early_memremap(ALIGN_DOWN(prsv, PAGE_SIZE),
PAGE_SIZE);
if (p == NULL) {
pr_err("Could not map UEFI memreserve entry!\n");
return -ENOMEM;
}
rsv = (void *)(p + prsv % PAGE_SIZE);
/* reserve the entry itself */
efi: Replace zero-length array and use struct_size() helper The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. Lastly, make use of the sizeof_field() helper instead of an open-coded version. This issue was found with the help of Coccinelle and audited _manually_. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20200527171425.GA4053@embeddedor Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2020-05-28 01:14:25 +08:00
memblock_reserve(prsv,
struct_size(rsv, entry, rsv->size));
for (i = 0; i < atomic_read(&rsv->count); i++) {
memblock_reserve(rsv->entry[i].base,
rsv->entry[i].size);
}
prsv = rsv->next;
early_memunmap(p, PAGE_SIZE);
}
}
if (rt_prop != EFI_INVALID_TABLE_ADDR) {
efi_rt_properties_table_t *tbl;
tbl = early_memremap(rt_prop, sizeof(*tbl));
if (tbl) {
efi.runtime_supported_mask &= tbl->runtime_services_supported;
early_memunmap(tbl, sizeof(*tbl));
}
}
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) &&
initrd != EFI_INVALID_TABLE_ADDR && phys_initrd_size == 0) {
struct linux_efi_initrd *tbl;
tbl = early_memremap(initrd, sizeof(*tbl));
if (tbl) {
phys_initrd_start = tbl->base;
phys_initrd_size = tbl->size;
early_memunmap(tbl, sizeof(*tbl));
}
}
if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
efi.unaccepted != EFI_INVALID_TABLE_ADDR) {
struct efi_unaccepted_memory *unaccepted;
unaccepted = early_memremap(efi.unaccepted, sizeof(*unaccepted));
if (unaccepted) {
if (unaccepted->version == 1) {
reserve_unaccepted(unaccepted);
} else {
efi.unaccepted = EFI_INVALID_TABLE_ADDR;
}
early_memunmap(unaccepted, sizeof(*unaccepted));
}
}
return 0;
}
int __init efi_systab_check_header(const efi_table_hdr_t *systab_hdr)
{
if (systab_hdr->signature != EFI_SYSTEM_TABLE_SIGNATURE) {
pr_err("System table signature incorrect!\n");
return -EINVAL;
}
return 0;
}
static const efi_char16_t *__init map_fw_vendor(unsigned long fw_vendor,
size_t size)
{
const efi_char16_t *ret;
ret = early_memremap_ro(fw_vendor, size);
if (!ret)
pr_err("Could not map the firmware vendor!\n");
return ret;
}
static void __init unmap_fw_vendor(const void *fw_vendor, size_t size)
{
early_memunmap((void *)fw_vendor, size);
}
void __init efi_systab_report_header(const efi_table_hdr_t *systab_hdr,
unsigned long fw_vendor)
{
char vendor[100] = "unknown";
const efi_char16_t *c16;
size_t i;
u16 rev;
c16 = map_fw_vendor(fw_vendor, sizeof(vendor) * sizeof(efi_char16_t));
if (c16) {
for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
vendor[i] = c16[i];
vendor[i] = '\0';
unmap_fw_vendor(c16, sizeof(vendor) * sizeof(efi_char16_t));
}
rev = (u16)systab_hdr->revision;
pr_info("EFI v%u.%u", systab_hdr->revision >> 16, rev / 10);
rev %= 10;
if (rev)
pr_cont(".%u", rev);
pr_cont(" by %s\n", vendor);
efi: runtime: avoid EFIv2 runtime services on Apple x86 machines Aditya reports [0] that his recent MacbookPro crashes in the firmware when using the variable services at runtime. The culprit appears to be a call to QueryVariableInfo(), which we did not use to call on Apple x86 machines in the past as they only upgraded from EFI v1.10 to EFI v2.40 firmware fairly recently, and QueryVariableInfo() (along with UpdateCapsule() et al) was added in EFI v2.00. The only runtime service introduced in EFI v2.00 that we actually use in Linux is QueryVariableInfo(), as the capsule based ones are optional, generally not used at runtime (all the LVFS/fwupd firmware update infrastructure uses helper EFI programs that invoke capsule update at boot time, not runtime), and not implemented by Apple machines in the first place. QueryVariableInfo() is used to 'safely' set variables, i.e., only when there is enough space. This prevents machines with buggy firmwares from corrupting their NVRAMs when they run out of space. Given that Apple machines have been using EFI v1.10 services only for the longest time (the EFI v2.0 spec was released in 2006, and Linux support for the newly introduced runtime services was added in 2011, but the MacbookPro12,1 released in 2015 still claims to be EFI v1.10 only), let's avoid the EFI v2.0 ones on all Apple x86 machines. [0] https://lore.kernel.org/all/6D757C75-65B1-468B-842D-10410081A8E4@live.com/ Cc: <stable@vger.kernel.org> Cc: Jeremy Kerr <jk@ozlabs.org> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Reported-by: Aditya Garg <gargaditya08@live.com> Tested-by: Orlando Chamberlain <redecorating@protonmail.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Aditya Garg <gargaditya08@live.com> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215277
2022-01-12 18:14:13 +08:00
if (IS_ENABLED(CONFIG_X86_64) &&
systab_hdr->revision > EFI_1_10_SYSTEM_TABLE_REVISION &&
!strcmp(vendor, "Apple")) {
pr_info("Apple Mac detected, using EFI v1.10 runtime services only\n");
efi.runtime_version = EFI_1_10_SYSTEM_TABLE_REVISION;
}
}
static __initdata char memory_type_name[][13] = {
"Reserved",
"Loader Code",
"Loader Data",
"Boot Code",
"Boot Data",
"Runtime Code",
"Runtime Data",
"Conventional",
"Unusable",
"ACPI Reclaim",
"ACPI Mem NVS",
"MMIO",
"MMIO Port",
"PAL Code",
"Persistent",
efi/libstub: Implement support for unaccepted memory UEFI Specification version 2.9 introduces the concept of memory acceptance: Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, requiring memory to be accepted before it can be used by the guest. Accepting happens via a protocol specific for the Virtual Machine platform. Accepting memory is costly and it makes VMM allocate memory for the accepted guest physical address range. It's better to postpone memory acceptance until memory is needed. It lowers boot time and reduces memory overhead. The kernel needs to know what memory has been accepted. Firmware communicates this information via memory map: a new memory type -- EFI_UNACCEPTED_MEMORY -- indicates such memory. Range-based tracking works fine for firmware, but it gets bulky for the kernel: e820 (or whatever the arch uses) has to be modified on every page acceptance. It leads to table fragmentation and there's a limited number of entries in the e820 table. Another option is to mark such memory as usable in e820 and track if the range has been accepted in a bitmap. One bit in the bitmap represents a naturally aligned power-2-sized region of address space -- unit. For x86, unit size is 2MiB: 4k of the bitmap is enough to track 64GiB or physical address space. In the worst-case scenario -- a huge hole in the middle of the address space -- It needs 256MiB to handle 4PiB of the address space. Any unaccepted memory that is not aligned to unit_size gets accepted upfront. The bitmap is allocated and constructed in the EFI stub and passed down to the kernel via EFI configuration table. allocate_e820() allocates the bitmap if unaccepted memory is present, according to the size of unaccepted region. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Link: https://lore.kernel.org/r/20230606142637.5171-4-kirill.shutemov@linux.intel.com
2023-06-06 22:26:31 +08:00
"Unaccepted",
};
char * __init efi_md_typeattr_format(char *buf, size_t size,
const efi_memory_desc_t *md)
{
char *pos;
int type_len;
u64 attr;
pos = buf;
if (md->type >= ARRAY_SIZE(memory_type_name))
type_len = snprintf(pos, size, "[type=%u", md->type);
else
type_len = snprintf(pos, size, "[%-*s",
(int)(sizeof(memory_type_name[0]) - 1),
memory_type_name[md->type]);
if (type_len >= size)
return buf;
pos += type_len;
size -= type_len;
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO |
EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP |
EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_CPU_CRYPTO |
EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
(unsigned long long)attr);
else
snprintf(pos, size,
"|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
attr & EFI_MEMORY_CPU_CRYPTO ? "CC" : "",
attr & EFI_MEMORY_SP ? "SP" : "",
attr & EFI_MEMORY_NV ? "NV" : "",
attr & EFI_MEMORY_XP ? "XP" : "",
attr & EFI_MEMORY_RP ? "RP" : "",
attr & EFI_MEMORY_WP ? "WP" : "",
attr & EFI_MEMORY_RO ? "RO" : "",
attr & EFI_MEMORY_UCE ? "UCE" : "",
attr & EFI_MEMORY_WB ? "WB" : "",
attr & EFI_MEMORY_WT ? "WT" : "",
attr & EFI_MEMORY_WC ? "WC" : "",
attr & EFI_MEMORY_UC ? "UC" : "");
return buf;
}
/*
* efi_mem_attributes - lookup memmap attributes for physical address
* @phys_addr: the physical address to lookup
*
* Search in the EFI memory map for the region covering
* @phys_addr. Returns the EFI memory attributes if the region
* was found in the memory map, 0 otherwise.
*/
u64 efi_mem_attributes(unsigned long phys_addr)
{
efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP))
return 0;
for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->attribute;
}
return 0;
}
/*
* efi_mem_type - lookup memmap type for physical address
* @phys_addr: the physical address to lookup
*
* Search in the EFI memory map for the region covering @phys_addr.
* Returns the EFI memory type if the region was found in the memory
* map, -EINVAL otherwise.
*/
int efi_mem_type(unsigned long phys_addr)
{
const efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP))
return -ENOTSUPP;
for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->type;
}
return -EINVAL;
}
int efi_status_to_err(efi_status_t status)
{
int err;
switch (status) {
case EFI_SUCCESS:
err = 0;
break;
case EFI_INVALID_PARAMETER:
err = -EINVAL;
break;
case EFI_OUT_OF_RESOURCES:
err = -ENOSPC;
break;
case EFI_DEVICE_ERROR:
err = -EIO;
break;
case EFI_WRITE_PROTECTED:
err = -EROFS;
break;
case EFI_SECURITY_VIOLATION:
err = -EACCES;
break;
case EFI_NOT_FOUND:
err = -ENOENT;
break;
case EFI_ABORTED:
err = -EINTR;
break;
default:
err = -EINVAL;
}
return err;
}
EXPORT_SYMBOL_GPL(efi_status_to_err);
static DEFINE_SPINLOCK(efi_mem_reserve_persistent_lock);
static struct linux_efi_memreserve *efi_memreserve_root __ro_after_init;
static int __init efi_memreserve_map_root(void)
{
if (mem_reserve == EFI_INVALID_TABLE_ADDR)
return -ENODEV;
efi_memreserve_root = memremap(mem_reserve,
sizeof(*efi_memreserve_root),
MEMREMAP_WB);
if (WARN_ON_ONCE(!efi_memreserve_root))
return -ENOMEM;
return 0;
}
static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size)
{
struct resource *res, *parent;
int ret;
res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
if (!res)
return -ENOMEM;
res->name = "reserved";
res->flags = IORESOURCE_MEM;
res->start = addr;
res->end = addr + size - 1;
/* we expect a conflict with a 'System RAM' region */
parent = request_resource_conflict(&iomem_resource, res);
ret = parent ? request_resource(parent, res) : 0;
/*
* Given that efi_mem_reserve_iomem() can be called at any
* time, only call memblock_reserve() if the architecture
* keeps the infrastructure around.
*/
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret)
memblock_reserve(addr, size);
return ret;
}
int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
{
struct linux_efi_memreserve *rsv;
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-30 01:12:29 +08:00
unsigned long prsv;
int rc, index;
if (efi_memreserve_root == (void *)ULONG_MAX)
return -ENODEV;
if (!efi_memreserve_root) {
rc = efi_memreserve_map_root();
if (rc)
return rc;
}
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-30 01:12:29 +08:00
/* first try to find a slot in an existing linked list entry */
for (prsv = efi_memreserve_root->next; prsv; ) {
rsv = memremap(prsv, sizeof(*rsv), MEMREMAP_WB);
if (!rsv)
return -ENOMEM;
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-30 01:12:29 +08:00
index = atomic_fetch_add_unless(&rsv->count, 1, rsv->size);
if (index < rsv->size) {
rsv->entry[index].base = addr;
rsv->entry[index].size = size;
memunmap(rsv);
return efi_mem_reserve_iomem(addr, size);
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-30 01:12:29 +08:00
}
prsv = rsv->next;
memunmap(rsv);
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-30 01:12:29 +08:00
}
/* no slot found - allocate a new linked list entry */
rsv = (struct linux_efi_memreserve *)__get_free_page(GFP_ATOMIC);
if (!rsv)
return -ENOMEM;
rc = efi_mem_reserve_iomem(__pa(rsv), SZ_4K);
if (rc) {
free_page((unsigned long)rsv);
return rc;
}
/*
* The memremap() call above assumes that a linux_efi_memreserve entry
* never crosses a page boundary, so let's ensure that this remains true
* even when kexec'ing a 4k pages kernel from a >4k pages kernel, by
* using SZ_4K explicitly in the size calculation below.
*/
rsv->size = EFI_MEMRESERVE_COUNT(SZ_4K);
atomic_set(&rsv->count, 1);
rsv->entry[0].base = addr;
rsv->entry[0].size = size;
spin_lock(&efi_mem_reserve_persistent_lock);
rsv->next = efi_memreserve_root->next;
efi_memreserve_root->next = __pa(rsv);
spin_unlock(&efi_mem_reserve_persistent_lock);
return efi_mem_reserve_iomem(addr, size);
}
static int __init efi_memreserve_root_init(void)
{
if (efi_memreserve_root)
return 0;
if (efi_memreserve_map_root())
efi_memreserve_root = (void *)ULONG_MAX;
return 0;
}
early_initcall(efi_memreserve_root_init);
#ifdef CONFIG_KEXEC
static int update_efi_random_seed(struct notifier_block *nb,
unsigned long code, void *unused)
{
struct linux_efi_random_seed *seed;
u32 size = 0;
if (!kexec_in_progress)
return NOTIFY_DONE;
seed = memremap(efi_rng_seed, sizeof(*seed), MEMREMAP_WB);
if (seed != NULL) {
size = min(seed->size, EFI_RANDOM_SEED_SIZE);
memunmap(seed);
} else {
pr_err("Could not map UEFI random seed!\n");
}
if (size > 0) {
seed = memremap(efi_rng_seed, sizeof(*seed) + size,
MEMREMAP_WB);
if (seed != NULL) {
seed->size = size;
get_random_bytes(seed->bits, seed->size);
memunmap(seed);
} else {
pr_err("Could not map UEFI random seed!\n");
}
}
return NOTIFY_DONE;
}
static struct notifier_block efi_random_seed_nb = {
.notifier_call = update_efi_random_seed,
};
static int __init register_update_efi_random_seed(void)
{
if (efi_rng_seed == EFI_INVALID_TABLE_ADDR)
return 0;
return register_reboot_notifier(&efi_random_seed_nb);
}
late_initcall(register_update_efi_random_seed);
#endif