2018-09-21 14:26:58 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
|
|
* Hibernation support for x86
|
|
|
|
*
|
|
|
|
* Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
|
|
|
|
* Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
|
|
|
|
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
|
|
|
|
*/
|
|
|
|
#include <linux/gfp.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/suspend.h>
|
|
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include <linux/kdebug.h>
|
x86/power: Fix 'nosmt' vs hibernation triple fault during resume
As explained in
0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once")
we always, no matter what, have to bring up x86 HT siblings during boot at
least once in order to avoid first MCE bringing the system to its knees.
That means that whenever 'nosmt' is supplied on the kernel command-line,
all the HT siblings are as a result sitting in mwait or cpuidle after
going through the online-offline cycle at least once.
This causes a serious issue though when a kernel, which saw 'nosmt' on its
commandline, is going to perform resume from hibernation: if the resume
from the hibernated image is successful, cr3 is flipped in order to point
to the address space of the kernel that is being resumed, which in turn
means that all the HT siblings are all of a sudden mwaiting on address
which is no longer valid.
That results in triple fault shortly after cr3 is switched, and machine
reboots.
Fix this by always waking up all the SMT siblings before initiating the
'restore from hibernation' process; this guarantees that all the HT
siblings will be properly carried over to the resumed kernel waiting in
resume_play_dead(), and acted upon accordingly afterwards, based on the
target kernel configuration.
Symmetrically, the resumed kernel has to push the SMT siblings to mwait
again in case it has SMT disabled; this means it has to online all
the siblings when resuming (so that they come out of hlt) and offline
them again to let them reach mwait.
Cc: 4.19+ <stable@vger.kernel.org> # v4.19+
Debugged-by: Thomas Gleixner <tglx@linutronix.de>
Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once")
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2019-05-30 06:09:39 +08:00
|
|
|
#include <linux/cpu.h>
|
2020-06-09 12:32:42 +08:00
|
|
|
#include <linux/pgtable.h>
|
2021-04-20 20:57:39 +08:00
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/crc32.h>
|
2018-09-21 14:26:58 +08:00
|
|
|
|
|
|
|
#include <asm/e820/api.h>
|
|
|
|
#include <asm/init.h>
|
|
|
|
#include <asm/proto.h>
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/mtrr.h>
|
|
|
|
#include <asm/sections.h>
|
|
|
|
#include <asm/suspend.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Address to jump to in the last phase of restore in order to get to the image
|
|
|
|
* kernel's text (this value is passed in the image header).
|
|
|
|
*/
|
|
|
|
unsigned long restore_jump_address __visible;
|
|
|
|
unsigned long jump_address_phys;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Value of the cr3 register from before the hibernation (this value is passed
|
|
|
|
* in the image header).
|
|
|
|
*/
|
|
|
|
unsigned long restore_cr3 __visible;
|
2018-09-21 14:27:40 +08:00
|
|
|
unsigned long temp_pgt __visible;
|
2018-09-21 14:26:58 +08:00
|
|
|
unsigned long relocated_restore_code __visible;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pfn_is_nosave - check if given pfn is in the 'nosave' section
|
|
|
|
*/
|
|
|
|
int pfn_is_nosave(unsigned long pfn)
|
|
|
|
{
|
|
|
|
unsigned long nosave_begin_pfn;
|
|
|
|
unsigned long nosave_end_pfn;
|
|
|
|
|
|
|
|
nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
|
|
|
|
nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Architecture-specific part of the hibernation image header.
 *
 * Written by arch_hibernation_header_save() and parsed by
 * arch_hibernation_header_restore(); the field order is the on-image
 * layout, so it must not be rearranged.
 */
struct restore_data_record {
	/* Virtual address of restore_registers in the image kernel. */
	unsigned long jump_address;
	/* Physical address of restore_registers in the image kernel. */
	unsigned long jump_address_phys;
	/* CR3 to load for the image kernel, with the PCID bits cleared. */
	unsigned long cr3;
	/* Must equal RESTORE_MAGIC for the header to be accepted. */
	unsigned long magic;
	/* CRC32 of the firmware e820 table at hibernation time. */
	unsigned long e820_checksum;
};
|
|
|
|
|
|
|
|
/**
|
2021-04-20 20:57:39 +08:00
|
|
|
* compute_e820_crc32 - calculate crc32 of a given e820 table
|
2018-09-21 14:26:58 +08:00
|
|
|
*
|
|
|
|
* @table: the e820 table to be calculated
|
2021-04-20 20:57:39 +08:00
|
|
|
*
|
|
|
|
* Return: the resulting checksum
|
2018-09-21 14:26:58 +08:00
|
|
|
*/
|
2021-04-20 20:57:39 +08:00
|
|
|
static inline u32 compute_e820_crc32(struct e820_table *table)
|
2018-09-21 14:26:58 +08:00
|
|
|
{
|
2021-04-20 20:57:39 +08:00
|
|
|
int size = offsetof(struct e820_table, entries) +
|
2018-09-21 14:26:58 +08:00
|
|
|
sizeof(struct e820_entry) * table->nr_entries;
|
|
|
|
|
2021-04-20 20:57:39 +08:00
|
|
|
return ~crc32_le(~0, (unsigned char const *)table, size);
|
2018-09-21 14:26:58 +08:00
|
|
|
}
|
|
|
|
|
2018-09-21 14:27:29 +08:00
|
|
|
/*
 * Tag stored in restore_data_record.magic.  Builds with and without
 * CONFIG_X86_64 use distinct values.
 */
#ifndef CONFIG_X86_64
#define RESTORE_MAGIC	0x12345679UL
#else
#define RESTORE_MAGIC	0x23456789ABCDEF02UL
#endif
|
2018-09-21 14:26:58 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* arch_hibernation_header_save - populate the architecture specific part
|
|
|
|
* of a hibernation image header
|
|
|
|
* @addr: address to save the data at
|
|
|
|
*/
|
|
|
|
int arch_hibernation_header_save(void *addr, unsigned int max_size)
|
|
|
|
{
|
|
|
|
struct restore_data_record *rdr = addr;
|
|
|
|
|
|
|
|
if (max_size < sizeof(struct restore_data_record))
|
|
|
|
return -EOVERFLOW;
|
|
|
|
rdr->magic = RESTORE_MAGIC;
|
|
|
|
rdr->jump_address = (unsigned long)restore_registers;
|
|
|
|
rdr->jump_address_phys = __pa_symbol(restore_registers);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The restore code fixes up CR3 and CR4 in the following sequence:
|
|
|
|
*
|
|
|
|
* [in hibernation asm]
|
|
|
|
* 1. CR3 <= temporary page tables
|
|
|
|
* 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
|
|
|
|
* 3. CR3 <= rdr->cr3
|
|
|
|
* 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
|
|
|
|
* [in restore_processor_state()]
|
|
|
|
* 5. CR4 <= saved CR4
|
|
|
|
* 6. CR3 <= saved CR3
|
|
|
|
*
|
|
|
|
* Our mmu_cr4_features has CR4.PCIDE=0, and toggling
|
|
|
|
* CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
|
|
|
|
* rdr->cr3 needs to point to valid page tables but must not
|
|
|
|
* have any of the PCID bits set.
|
|
|
|
*/
|
|
|
|
rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
|
|
|
|
|
2021-04-20 20:57:39 +08:00
|
|
|
rdr->e820_checksum = compute_e820_crc32(e820_table_firmware);
|
|
|
|
return 0;
|
2018-09-21 14:26:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* arch_hibernation_header_restore - read the architecture specific data
|
|
|
|
* from the hibernation image header
|
|
|
|
* @addr: address to read the data from
|
|
|
|
*/
|
|
|
|
int arch_hibernation_header_restore(void *addr)
|
|
|
|
{
|
|
|
|
struct restore_data_record *rdr = addr;
|
|
|
|
|
|
|
|
if (rdr->magic != RESTORE_MAGIC) {
|
|
|
|
pr_crit("Unrecognized hibernate image header format!\n");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
restore_jump_address = rdr->jump_address;
|
|
|
|
jump_address_phys = rdr->jump_address_phys;
|
2018-09-21 14:28:11 +08:00
|
|
|
restore_cr3 = rdr->cr3;
|
2018-09-21 14:26:58 +08:00
|
|
|
|
2021-04-20 20:57:39 +08:00
|
|
|
if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) {
|
2018-09-21 14:26:58 +08:00
|
|
|
pr_crit("Hibernate inconsistent memory map detected!\n");
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int relocate_restore_code(void)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
p4d_t *p4d;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
relocated_restore_code = get_safe_page(GFP_ATOMIC);
|
|
|
|
if (!relocated_restore_code)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
x86,pm: Force out-of-line memcpy()
GCC fancies inlining memcpy(), and because it cannot prove the
destination is page-aligned (it is) it ends up generating atrocious
code like:
19e: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1a5 <relocate_restore_code+0x25> 1a1: R_X86_64_PC32 core_restore_code-0x4
1a5: 48 8d 78 08 lea 0x8(%rax),%rdi
1a9: 48 89 c1 mov %rax,%rcx
1ac: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1af: R_X86_64_32S core_restore_code
1b3: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi
1b7: 48 29 f9 sub %rdi,%rcx
1ba: 48 89 10 mov %rdx,(%rax)
1bd: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1c4 <relocate_restore_code+0x44> 1c0: R_X86_64_PC32 core_restore_code+0xff4
1c4: 48 29 ce sub %rcx,%rsi
1c7: 81 c1 00 10 00 00 add $0x1000,%ecx
1cd: 48 89 90 f8 0f 00 00 mov %rdx,0xff8(%rax)
1d4: c1 e9 03 shr $0x3,%ecx
1d7: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
Notably the alignment code generates a text reference to
code_restore_code+0xff8, for which objtool raises the objection:
vmlinux.o: warning: objtool: relocate_restore_code+0x3d: relocation to !ENDBR: next_arg+0x18
Applying some __assume_aligned(PAGE_SIZE) improve code-gen to:
19e: 48 89 c7 mov %rax,%rdi
1a1: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a4: R_X86_64_32S core_restore_code
1a8: b9 00 02 00 00 mov $0x200,%ecx
1ad: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
And resolve the problem, however, none of this is important code and
a much simpler solution still is to force a memcpy() call:
1a1: ba 00 10 00 00 mov $0x1000,%edx
1a6: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a9: R_X86_64_32S core_restore_code
1ad: e8 00 00 00 00 call 1b2 <relocate_restore_code+0x32> 1ae: R_X86_64_PLT32 __memcpy-0x4
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2022-11-04 04:17:03 +08:00
|
|
|
__memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
|
2018-09-21 14:26:58 +08:00
|
|
|
|
|
|
|
/* Make the page containing the relocated code executable */
|
|
|
|
pgd = (pgd_t *)__va(read_cr3_pa()) +
|
|
|
|
pgd_index(relocated_restore_code);
|
|
|
|
p4d = p4d_offset(pgd, relocated_restore_code);
|
|
|
|
if (p4d_large(*p4d)) {
|
|
|
|
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
pud = pud_offset(p4d, relocated_restore_code);
|
|
|
|
if (pud_large(*pud)) {
|
|
|
|
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
pmd = pmd_offset(pud, relocated_restore_code);
|
|
|
|
if (pmd_large(*pmd)) {
|
|
|
|
set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
pte = pte_offset_kernel(pmd, relocated_restore_code);
|
|
|
|
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
|
|
|
|
out:
|
|
|
|
__flush_tlb_all();
|
|
|
|
return 0;
|
|
|
|
}
|
x86/power: Fix 'nosmt' vs hibernation triple fault during resume
As explained in
0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once")
we always, no matter what, have to bring up x86 HT siblings during boot at
least once in order to avoid first MCE bringing the system to its knees.
That means that whenever 'nosmt' is supplied on the kernel command-line,
all the HT siblings are as a result sitting in mwait or cpuidle after
going through the online-offline cycle at least once.
This causes a serious issue though when a kernel, which saw 'nosmt' on its
commandline, is going to perform resume from hibernation: if the resume
from the hibernated image is successful, cr3 is flipped in order to point
to the address space of the kernel that is being resumed, which in turn
means that all the HT siblings are all of a sudden mwaiting on address
which is no longer valid.
That results in triple fault shortly after cr3 is switched, and machine
reboots.
Fix this by always waking up all the SMT siblings before initiating the
'restore from hibernation' process; this guarantees that all the HT
siblings will be properly carried over to the resumed kernel waiting in
resume_play_dead(), and acted upon accordingly afterwards, based on the
target kernel configuration.
Symmetrically, the resumed kernel has to push the SMT siblings to mwait
again in case it has SMT disabled; this means it has to online all
the siblings when resuming (so that they come out of hlt) and offline
them again to let them reach mwait.
Cc: 4.19+ <stable@vger.kernel.org> # v4.19+
Debugged-by: Thomas Gleixner <tglx@linutronix.de>
Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once")
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2019-05-30 06:09:39 +08:00
|
|
|
|
|
|
|
int arch_resume_nosmt(void)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
/*
|
|
|
|
* We reached this while coming out of hibernation. This means
|
|
|
|
* that SMT siblings are sleeping in hlt, as mwait is not safe
|
|
|
|
* against control transition during resume (see comment in
|
|
|
|
* hibernate_resume_nonboot_cpu_disable()).
|
|
|
|
*
|
|
|
|
* If the resumed kernel has SMT disabled, we have to take all the
|
|
|
|
* SMT siblings out of hlt, and offline them again so that they
|
|
|
|
* end up in mwait proper.
|
|
|
|
*
|
|
|
|
* Called with hotplug disabled.
|
|
|
|
*/
|
|
|
|
cpu_hotplug_enable();
|
|
|
|
if (cpu_smt_control == CPU_SMT_DISABLED ||
|
|
|
|
cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
|
|
|
|
enum cpuhp_smt_control old = cpu_smt_control;
|
|
|
|
|
|
|
|
ret = cpuhp_smt_enable();
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
ret = cpuhp_smt_disable(old);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
cpu_hotplug_disable();
|
|
|
|
return ret;
|
|
|
|
}
|