diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 8e0f8b85b209..a504adc661a4 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -4,6 +4,7 @@ #include extern struct e820_table *e820_table; +extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0b1ff4c1c14e..fffb2794dd89 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,6 +7,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d78a586ba8dc..532da61d605c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -20,10 +20,12 @@ #include /* - * We organize the E820 table into two main data structures: + * We organize the E820 table into three main data structures: * * - 'e820_table_firmware': the original firmware version passed to us by the - * bootloader - not modified by the kernel. We use this to: + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap @@ -31,6 +33,14 @@ * - the hibernation code uses it to generate a kernel-independent MD5 * fingerprint of the physical memory layout of a system. * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * * - kexec, which is a bootloader in disguise, uses the original E820 * layout to pass to the kexec-ed kernel. This way the original kernel * can have a restricted E820 map while the kexec()-ed kexec-kernel @@ -46,9 +56,11 @@ * specific memory layout data during early bootup. */ static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; static struct e820_table e820_table_firmware_init __initdata; struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ @@ -470,9 +482,9 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -546,9 +558,9 @@ void __init e820__update_table_print(void) e820__print_table("modified"); } -static void __init e820__update_table_firmware(void) +static void __init e820__update_table_kexec(void) { - e820__update_table(e820_table_firmware); + e820__update_table(e820_table_kexec); } #define MAX_GAP_END 0x100000000ull @@ -623,7 +635,7 @@ __init void e820__setup_pci_gap(void) /* * Called late during init, in free_initmem(). * - * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * Initial e820_table and e820_table_kexec are largish __initdata arrays. * * Copy them to a (usually much smaller) dynamically allocated area that is * sized precisely after the number of e820 entries. @@ -643,6 +655,12 @@ __init void e820__reallocate_tables(void) memcpy(n, e820_table, size); e820_table = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_table_kexec, size); + e820_table_kexec = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); @@ -669,6 +687,9 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) __append_e820_table(extmap, entries); e820__update_table(e820_table); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + early_memunmap(sdata, data_len); pr_info("e820: extended physical RAM map:\n"); e820__print_table("extended"); @@ -727,7 +748,7 @@ core_initcall(e820__register_nvs_regions); /* * Allocate the requested number of bytes with the requsted alignment * and return (the physical address) to the caller. Also register this - * range in the 'firmware' E820 table as a reserved range. + * range in the 'kexec' E820 table as a reserved range. * * This allows kexec to fake a new mptable, as if it came from the real * system. @@ -738,9 +759,9 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); - pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); - e820__update_table_firmware(); + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); } return addr; @@ -923,13 +944,13 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); pa_data = data->next; early_memunmap(data, sizeof(*data)); } e820__update_table(e820_table); - - memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); @@ -1062,6 +1083,7 @@ void __init e820__reserve_resources(void) res++; } + /* Expose the bootloader-provided memory layout to the sysfs. */ for (i = 0; i < e820_table_firmware->nr_entries; i++) { struct e820_entry *entry = e820_table_firmware->entries + i; @@ -1175,6 +1197,7 @@ void __init e820__memory_setup(void) who = x86_init.resources.memory_setup(); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); pr_info("e820: BIOS-provided physical RAM map:\n"); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 9d7fd5e6689a..fb095ba0c02f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -100,14 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_table_firmware->nr_entries; + nr_e820_entries = e820_table_kexec->nr_entries; /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); + memcpy(¶ms->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 65622f07e633..3486d0498800 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1075,6 +1075,13 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9b78685b66e6..45979502f64b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -37,14 +37,14 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool boot_cpu_done; - -static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); -static void init_cache_modes(void); +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; void pat_disable(const char *reason) { - if (!__pat_enabled) + if (pat_disabled) return; if (boot_cpu_done) { @@ -52,10 +52,8 @@ void pat_disable(const char *reason) return; } - __pat_enabled = 0; + pat_disabled = true; pr_info("x86/PAT: %s\n", reason); - - init_cache_modes(); } static int __init nopat(char *str) @@ -67,7 +65,7 @@ early_param("nopat", nopat); bool pat_enabled(void) { - return !!__pat_enabled; + return pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -205,6 +203,8 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -225,6 +225,7 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; __init_cache_modes(pat); } @@ -242,10 +243,9 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -static void init_cache_modes(void) +void init_cache_modes(void) { u64 pat = 0; - static int init_cm_done; if (init_cm_done) return; @@ -287,8 +287,6 @@ static void init_cache_modes(void) } __init_cache_modes(pat); - - init_cm_done = 1; } /** @@ -306,10 +304,8 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { - init_cache_modes(); + if (pat_disabled) return; - } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 2983faab5b18..d4a61ddf9e62 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -587,32 +587,12 @@ static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; } -/* - * Return whether the status of the descriptor that is normally used for this - * cpu (the one indexed by its hub-relative cpu number) is busy. - * The status of the original 32 descriptors is always reflected in the 64 - * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. - * The bit provided by the activation_status_2 register is irrelevant to - * the status if it is only being tested for busy or not busy. - */ -int normal_busy(struct bau_control *bcp) -{ - int cpu = bcp->uvhub_cpu; - int mmr_offset; - int right_shift; - - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - return (((((read_lmmr(mmr_offset) >> right_shift) & - UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); -} - /* * Entered when a bau descriptor has gone into a permanent busy wait because * of a hardware bug. * Workaround the bug. */ -int handle_uv2_busy(struct bau_control *bcp) +static int handle_uv2_busy(struct bau_control *bcp) { struct ptc_stats *stat = bcp->statp; @@ -917,8 +897,9 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, - struct bau_desc *bau_desc) +static int uv_flush_send_and_wait(struct cpumask *flush_mask, + struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -1212,8 +1193,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * Search the message queue for any 'other' unprocessed message with the * same software acknowledge resource bit vector as the 'msg' message. */ -struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp) +static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; unsigned char swack_vec = msg->swack_vec;