
Merge branch 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 boot updates from Ingo Molnar:
 "The biggest changes in this cycle were:

   - prepare for more KASLR related changes, by restructuring, cleaning
     up and fixing the existing boot code.  (Kees Cook, Baoquan He,
     Yinghai Lu)

   - simplify/concentrate subarch handling code, eliminate
     paravirt_enabled() usage.  (Luis R. Rodriguez)"

* 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
  x86/KASLR: Clarify purpose of each get_random_long()
  x86/KASLR: Add virtual address choosing function
  x86/KASLR: Return earliest overlap when avoiding regions
  x86/KASLR: Add 'struct slot_area' to manage random_addr slots
  x86/boot: Add missing file header comments
  x86/KASLR: Initialize mapping_info every time
  x86/boot: Comment what finalize_identity_maps() does
  x86/KASLR: Build identity mappings on demand
  x86/boot: Split out kernel_ident_mapping_init()
  x86/boot: Clean up indenting for asm/boot.h
  x86/KASLR: Improve comments around the mem_avoid[] logic
  x86/boot: Simplify pointer casting in choose_random_location()
  x86/KASLR: Consolidate mem_avoid[] entries
  x86/boot: Clean up pointer casting
  x86/boot: Warn on future overlapping memcpy() use
  x86/boot: Extract error reporting functions
  x86/boot: Correctly bounds-check relocations
  x86/KASLR: Clean up unused code from old 'run_size' and rename it to 'kernel_total_size'
  x86/boot: Fix "run_size" calculation
  x86/boot: Calculate decompression size during boot not build
  ...
Commit 9a45f036af by Linus Torvalds, 2016-05-16 15:54:01 -07:00
50 changed files with 1258 additions and 782 deletions


@ -1921,54 +1921,38 @@ config RELOCATABLE
(CONFIG_PHYSICAL_START) is used as the minimum location.
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image"
bool "Randomize the address of the kernel image (KASLR)"
depends on RELOCATABLE
default n
---help---
Randomizes the physical and virtual address at which the
kernel image is decompressed, as a security feature that
deters exploit attempts relying on knowledge of the location
of kernel internals.
In support of Kernel Address Space Layout Randomization (KASLR),
this randomizes the physical address at which the kernel image
is decompressed and the virtual address where the kernel
image is mapped, as a security feature that deters exploit
attempts relying on knowledge of the location of kernel
code internals.
Entropy is generated using the RDRAND instruction if it is
supported. If RDTSC is supported, it is used as well. If
neither RDRAND nor RDTSC are supported, then randomness is
read from the i8254 timer.
The kernel physical and virtual address can be randomized
from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that
using RANDOMIZE_BASE reduces the memory space available to
kernel modules from 1.5GB to 1GB.)
The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
and aligned according to PHYSICAL_ALIGN. Since the kernel is
built using 2GiB addressing, and PHYSICAL_ALIGN must be at a
minimum of 2MiB, only 10 bits of entropy is theoretically
possible. At best, due to page table layouts, 64-bit can use
9 bits of entropy and 32-bit uses 8 bits.
Entropy is generated using the RDRAND instruction if it is
supported. If RDTSC is supported, its value is mixed into
the entropy pool as well. If neither RDRAND nor RDTSC are
supported, then entropy is read from the i8254 timer.
If unsure, say N.
Since the kernel is built using 2GB addressing, and
PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of
entropy is theoretically possible. Currently, with the
default value for PHYSICAL_ALIGN and due to page table
layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits.
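
As a back-of-the-envelope check of the entropy figures above, the slot
math can be reproduced with a standalone userspace sketch (not part of
this patch; the 1GB range and 2MB alignment are the 64-bit defaults
described in the help text):

#include <stdio.h>

int main(void)
{
	unsigned long range = 1UL << 30;	/* 64-bit randomization range: 1GB */
	unsigned long align = 2UL << 20;	/* default CONFIG_PHYSICAL_ALIGN: 2MB */
	unsigned long slots = range / align;	/* 512 aligned slots */
	int bits = 0;

	while (slots >>= 1)			/* log2(512) = 9 bits of entropy */
		bits++;

	printf("%lu slots => %d bits of entropy\n", range / align, bits);
	return 0;
}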
config RANDOMIZE_BASE_MAX_OFFSET
hex "Maximum kASLR offset allowed" if EXPERT
depends on RANDOMIZE_BASE
range 0x0 0x20000000 if X86_32
default "0x20000000" if X86_32
range 0x0 0x40000000 if X86_64
default "0x40000000" if X86_64
---help---
The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical
memory is used to determine the maximal offset in bytes that will
be applied to the kernel when kernel Address Space Layout
Randomization (kASLR) is active. This must be a multiple of
PHYSICAL_ALIGN.
If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot
time. To enable it, boot with "kaslr" on the kernel command
line (which will also disable hibernation).
On 32-bit this is limited to 512MiB by page table layouts. The
default is 512MiB.
On 64-bit this is limited by how the kernel fixmap page table is
positioned, so this cannot be larger than 1GiB currently. Without
RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel
and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the
modules area will shrink to compensate, up to the current maximum
1GiB to 1GiB split. The default is 1GiB.
If unsure, leave at the default value.
If unsure, say N.
# Relocation on x86 needs some additional build support
config X86_NEED_RELOCS


@ -208,7 +208,8 @@ endif
head-y := arch/x86/kernel/head_$(BITS).o
head-y += arch/x86/kernel/head$(BITS).o
head-y += arch/x86/kernel/head.o
head-y += arch/x86/kernel/ebda.o
head-y += arch/x86/kernel/platform-quirks.o
libs-y += arch/x86/lib/


@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
targets += voffset.h
$(obj)/voffset.h: vmlinux FORCE
$(call if_changed,voffset)
sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
AFLAGS_header.o += -I$(obj)
$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
$(obj)/header.o: $(obj)/zoffset.h
LDFLAGS_setup.elf := -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE


@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
targets += ../voffset.h
$(obj)/../voffset.h: vmlinux FORCE
$(call if_changed,voffset)
$(obj)/misc.o: $(obj)/../voffset.h
vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/string.o $(obj)/cmdline.o \
$(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
$(obj)/piggy.o $(obj)/cpuflags.o
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
endif
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_LZ4) := lz4
RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \
$(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh)
quiet_cmd_mkpiggy = MKPIGGY $@
cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
targets += piggy.S
$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE


@ -1,339 +0,0 @@
#include "misc.h"
#include <asm/msr.h>
#include <asm/archrandom.h>
#include <asm/e820.h>
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <generated/utsrelease.h>
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
#define I8254_PORT_CONTROL 0x43
#define I8254_PORT_COUNTER0 0x40
#define I8254_CMD_READBACK 0xC0
#define I8254_SELECT_COUNTER0 0x02
#define I8254_STATUS_NOTREADY 0x40
static inline u16 i8254(void)
{
u16 status, timer;
do {
outb(I8254_PORT_CONTROL,
I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
status = inb(I8254_PORT_COUNTER0);
timer = inb(I8254_PORT_COUNTER0);
timer |= inb(I8254_PORT_COUNTER0) << 8;
} while (status & I8254_STATUS_NOTREADY);
return timer;
}
static unsigned long rotate_xor(unsigned long hash, const void *area,
size_t size)
{
size_t i;
unsigned long *ptr = (unsigned long *)area;
for (i = 0; i < size / sizeof(hash); i++) {
/* Rotate by odd number of bits and XOR. */
hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
hash ^= ptr[i];
}
return hash;
}
/* Attempt to create a simple but unpredictable starting entropy. */
static unsigned long get_random_boot(void)
{
unsigned long hash = 0;
hash = rotate_xor(hash, build_str, sizeof(build_str));
hash = rotate_xor(hash, real_mode, sizeof(*real_mode));
return hash;
}
static unsigned long get_random_long(void)
{
#ifdef CONFIG_X86_64
const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
#else
const unsigned long mix_const = 0x3f39e593UL;
#endif
unsigned long raw, random = get_random_boot();
bool use_i8254 = true;
debug_putstr("KASLR using");
if (has_cpuflag(X86_FEATURE_RDRAND)) {
debug_putstr(" RDRAND");
if (rdrand_long(&raw)) {
random ^= raw;
use_i8254 = false;
}
}
if (has_cpuflag(X86_FEATURE_TSC)) {
debug_putstr(" RDTSC");
raw = rdtsc();
random ^= raw;
use_i8254 = false;
}
if (use_i8254) {
debug_putstr(" i8254");
random ^= i8254();
}
/* Circular multiply for better bit diffusion */
asm("mul %3"
: "=a" (random), "=d" (raw)
: "a" (random), "rm" (mix_const));
random += raw;
debug_putstr("...\n");
return random;
}
struct mem_vector {
unsigned long start;
unsigned long size;
};
#define MEM_AVOID_MAX 5
static struct mem_vector mem_avoid[MEM_AVOID_MAX];
static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
{
/* Item at least partially before region. */
if (item->start < region->start)
return false;
/* Item at least partially after region. */
if (item->start + item->size > region->start + region->size)
return false;
return true;
}
static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
/* Item one is entirely before item two. */
if (one->start + one->size <= two->start)
return false;
/* Item one is entirely after item two. */
if (one->start >= two->start + two->size)
return false;
return true;
}
static void mem_avoid_init(unsigned long input, unsigned long input_size,
unsigned long output, unsigned long output_size)
{
u64 initrd_start, initrd_size;
u64 cmd_line, cmd_line_size;
unsigned long unsafe, unsafe_len;
char *ptr;
/*
* Avoid the region that is unsafe to overlap during
* decompression (see calculations at top of misc.c).
*/
unsafe_len = (output_size >> 12) + 32768 + 18;
unsafe = (unsigned long)input + input_size - unsafe_len;
mem_avoid[0].start = unsafe;
mem_avoid[0].size = unsafe_len;
/* Avoid initrd. */
initrd_start = (u64)real_mode->ext_ramdisk_image << 32;
initrd_start |= real_mode->hdr.ramdisk_image;
initrd_size = (u64)real_mode->ext_ramdisk_size << 32;
initrd_size |= real_mode->hdr.ramdisk_size;
mem_avoid[1].start = initrd_start;
mem_avoid[1].size = initrd_size;
/* Avoid kernel command line. */
cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32;
cmd_line |= real_mode->hdr.cmd_line_ptr;
/* Calculate size of cmd_line. */
ptr = (char *)(unsigned long)cmd_line;
for (cmd_line_size = 0; ptr[cmd_line_size++]; )
;
mem_avoid[2].start = cmd_line;
mem_avoid[2].size = cmd_line_size;
/* Avoid heap memory. */
mem_avoid[3].start = (unsigned long)free_mem_ptr;
mem_avoid[3].size = BOOT_HEAP_SIZE;
/* Avoid stack memory. */
mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
mem_avoid[4].size = BOOT_STACK_SIZE;
}
/* Does this memory vector overlap a known avoided area? */
static bool mem_avoid_overlap(struct mem_vector *img)
{
int i;
struct setup_data *ptr;
for (i = 0; i < MEM_AVOID_MAX; i++) {
if (mem_overlaps(img, &mem_avoid[i]))
return true;
}
/* Avoid all entries in the setup_data linked list. */
ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data;
while (ptr) {
struct mem_vector avoid;
avoid.start = (unsigned long)ptr;
avoid.size = sizeof(*ptr) + ptr->len;
if (mem_overlaps(img, &avoid))
return true;
ptr = (struct setup_data *)(unsigned long)ptr->next;
}
return false;
}
static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
CONFIG_PHYSICAL_ALIGN];
static unsigned long slot_max;
static void slots_append(unsigned long addr)
{
/* Overflowing the slots list should be impossible. */
if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
CONFIG_PHYSICAL_ALIGN)
return;
slots[slot_max++] = addr;
}
static unsigned long slots_fetch_random(void)
{
/* Handle case of no slots stored. */
if (slot_max == 0)
return 0;
return slots[get_random_long() % slot_max];
}
static void process_e820_entry(struct e820entry *entry,
unsigned long minimum,
unsigned long image_size)
{
struct mem_vector region, img;
/* Skip non-RAM entries. */
if (entry->type != E820_RAM)
return;
/* Ignore entries entirely above our maximum. */
if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
return;
/* Ignore entries entirely below our minimum. */
if (entry->addr + entry->size < minimum)
return;
region.start = entry->addr;
region.size = entry->size;
/* Potentially raise address to minimum location. */
if (region.start < minimum)
region.start = minimum;
/* Potentially raise address to meet alignment requirements. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above the bounds of this e820 region? */
if (region.start > entry->addr + entry->size)
return;
/* Reduce size by any delta from the original address. */
region.size -= region.start - entry->addr;
/* Reduce maximum size to fit end of image within maximum limit. */
if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
/* Walk each aligned slot and check for avoided areas. */
for (img.start = region.start, img.size = image_size ;
mem_contains(&region, &img) ;
img.start += CONFIG_PHYSICAL_ALIGN) {
if (mem_avoid_overlap(&img))
continue;
slots_append(img.start);
}
}
static unsigned long find_random_addr(unsigned long minimum,
unsigned long size)
{
int i;
unsigned long addr;
/* Make sure minimum is aligned. */
minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
/* Verify potential e820 positions, appending to slots list. */
for (i = 0; i < real_mode->e820_entries; i++) {
process_e820_entry(&real_mode->e820_map[i], minimum, size);
}
return slots_fetch_random();
}
unsigned char *choose_kernel_location(struct boot_params *boot_params,
unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size)
{
unsigned long choice = (unsigned long)output;
unsigned long random;
#ifdef CONFIG_HIBERNATION
if (!cmdline_find_option_bool("kaslr")) {
debug_putstr("KASLR disabled by default...\n");
goto out;
}
#else
if (cmdline_find_option_bool("nokaslr")) {
debug_putstr("KASLR disabled by cmdline...\n");
goto out;
}
#endif
boot_params->hdr.loadflags |= KASLR_FLAG;
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
(unsigned long)output, output_size);
/* Walk e820 and find a random address. */
random = find_random_addr(choice, output_size);
if (!random) {
debug_putstr("KASLR could not find suitable E820 region...\n");
goto out;
}
/* Always enforce the minimum. */
if (random < choice)
goto out;
choice = random;
out:
return (unsigned char *)choice;
}


@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr)
#include "../cmdline.c"
static unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32;
return cmd_line_ptr;
}


@ -0,0 +1,22 @@
/*
* Callers outside of misc.c need access to the error reporting routines,
* but the *_putstr() functions need to stay in misc.c because of how
* memcpy() and memmove() are defined for the compressed boot environment.
*/
#include "misc.h"
void warn(char *m)
{
error_putstr("\n\n");
error_putstr(m);
error_putstr("\n\n");
}
void error(char *m)
{
warn(m);
error_putstr(" -- System halted");
while (1)
asm("hlt");
}


@ -0,0 +1,7 @@
#ifndef BOOT_COMPRESSED_ERROR_H
#define BOOT_COMPRESSED_ERROR_H
void warn(char *m);
void error(char *m);
#endif /* BOOT_COMPRESSED_ERROR_H */


@ -176,7 +176,9 @@ preferred_addr:
1:
/* Target address to relocate to for decompression */
addl $z_extract_offset, %ebx
movl BP_init_size(%esi), %eax
subl $_end, %eax
addl %eax, %ebx
/* Set up the stack */
leal boot_stack_end(%ebx), %esp
@ -233,24 +235,28 @@ relocated:
2:
/*
* Do the decompression, and jump to the new kernel..
* Do the extraction, and jump to the new kernel..
*/
/* push arguments for decompress_kernel: */
pushl $z_run_size /* size of kernel with .bss and .brk */
/* push arguments for extract_kernel: */
pushl $z_output_len /* decompressed length, end of relocs */
leal z_extract_offset_negative(%ebx), %ebp
movl BP_init_size(%esi), %eax
subl $_end, %eax
movl %ebx, %ebp
subl %eax, %ebp
pushl %ebp /* output address */
pushl $z_input_len /* input_len */
leal input_data(%ebx), %eax
pushl %eax /* input_data */
leal boot_heap(%ebx), %eax
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call decompress_kernel /* returns kernel location in %eax */
addl $28, %esp
call extract_kernel /* returns kernel location in %eax */
addl $24, %esp
/*
* Jump to the decompressed kernel.
* Jump to the extracted kernel.
*/
xorl %ebx, %ebx
jmp *%eax


@ -110,7 +110,9 @@ ENTRY(startup_32)
1:
/* Target address to relocate to for decompression */
addl $z_extract_offset, %ebx
movl BP_init_size(%esi), %eax
subl $_end, %eax
addl %eax, %ebx
/*
* Prepare for entering 64 bit mode
@ -132,7 +134,7 @@ ENTRY(startup_32)
/* Initialize Page tables to 0 */
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $((4096*6)/4), %ecx
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
/* Build Level 4 */
@ -338,7 +340,9 @@ preferred_addr:
1:
/* Target address to relocate to for decompression */
leaq z_extract_offset(%rbp), %rbx
movl BP_init_size(%rsi), %ebx
subl $_end, %ebx
addq %rbp, %rbx
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
@ -408,19 +412,16 @@ relocated:
2:
/*
* Do the decompression, and jump to the new kernel..
* Do the extraction, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
movq $z_run_size, %r9 /* size of kernel with .bss and .brk */
pushq %r9
movq %rsi, %rdi /* real mode address */
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
leaq input_data(%rip), %rdx /* input_data */
movl $z_input_len, %ecx /* input_len */
movq %rbp, %r8 /* output target address */
movq $z_output_len, %r9 /* decompressed length, end of relocs */
call decompress_kernel /* returns kernel location in %rax */
popq %r9
call extract_kernel /* returns kernel location in %rax */
popq %rsi
/*
@ -485,4 +486,4 @@ boot_stack_end:
.section ".pgtable","a",@nobits
.balign 4096
pgtable:
.fill 6*4096, 1, 0
.fill BOOT_PGT_SIZE, 1, 0


@ -0,0 +1,510 @@
/*
* kaslr.c
*
* This contains the routines needed to generate a reasonable level of
* entropy to choose a randomized kernel base address offset in support
* of Kernel Address Space Layout Randomization (KASLR). Additionally
* handles walking the physical memory maps (and tracking memory regions
* to avoid) in order to select a physical memory location that can
* contain the entire properly aligned running kernel image.
*
*/
#include "misc.h"
#include "error.h"
#include <asm/msr.h>
#include <asm/archrandom.h>
#include <asm/e820.h>
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <generated/utsrelease.h>
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
#define I8254_PORT_CONTROL 0x43
#define I8254_PORT_COUNTER0 0x40
#define I8254_CMD_READBACK 0xC0
#define I8254_SELECT_COUNTER0 0x02
#define I8254_STATUS_NOTREADY 0x40
static inline u16 i8254(void)
{
u16 status, timer;
do {
outb(I8254_PORT_CONTROL,
I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
status = inb(I8254_PORT_COUNTER0);
timer = inb(I8254_PORT_COUNTER0);
timer |= inb(I8254_PORT_COUNTER0) << 8;
} while (status & I8254_STATUS_NOTREADY);
return timer;
}
static unsigned long rotate_xor(unsigned long hash, const void *area,
size_t size)
{
size_t i;
unsigned long *ptr = (unsigned long *)area;
for (i = 0; i < size / sizeof(hash); i++) {
/* Rotate by odd number of bits and XOR. */
hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
hash ^= ptr[i];
}
return hash;
}
/* Attempt to create a simple but unpredictable starting entropy. */
static unsigned long get_random_boot(void)
{
unsigned long hash = 0;
hash = rotate_xor(hash, build_str, sizeof(build_str));
hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
return hash;
}
static unsigned long get_random_long(const char *purpose)
{
#ifdef CONFIG_X86_64
const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
#else
const unsigned long mix_const = 0x3f39e593UL;
#endif
unsigned long raw, random = get_random_boot();
bool use_i8254 = true;
debug_putstr(purpose);
debug_putstr(" KASLR using");
if (has_cpuflag(X86_FEATURE_RDRAND)) {
debug_putstr(" RDRAND");
if (rdrand_long(&raw)) {
random ^= raw;
use_i8254 = false;
}
}
if (has_cpuflag(X86_FEATURE_TSC)) {
debug_putstr(" RDTSC");
raw = rdtsc();
random ^= raw;
use_i8254 = false;
}
if (use_i8254) {
debug_putstr(" i8254");
random ^= i8254();
}
/* Circular multiply for better bit diffusion */
asm("mul %3"
: "=a" (random), "=d" (raw)
: "a" (random), "rm" (mix_const));
random += raw;
debug_putstr("...\n");
return random;
}
struct mem_vector {
unsigned long start;
unsigned long size;
};
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
MEM_AVOID_CMDLINE,
MEM_AVOID_BOOTPARAMS,
MEM_AVOID_MAX,
};
static struct mem_vector mem_avoid[MEM_AVOID_MAX];
static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
{
/* Item at least partially before region. */
if (item->start < region->start)
return false;
/* Item at least partially after region. */
if (item->start + item->size > region->start + region->size)
return false;
return true;
}
static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
/* Item one is entirely before item two. */
if (one->start + one->size <= two->start)
return false;
/* Item one is entirely after item two. */
if (one->start >= two->start + two->size)
return false;
return true;
}
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
* when KASLR searches for an appropriate random address. We must avoid any
* regions that are unsafe to overlap with during decompression, and other
* things like the initrd, cmdline and boot_params. This comment seeks to
* explain mem_avoid as clearly as possible since incorrect mem_avoid
* memory ranges lead to really hard to debug boot failures.
*
* The initrd, cmdline, and boot_params are trivial to identify for
* avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
* MEM_AVOID_BOOTPARAMS respectively below.
*
* What is less obvious is how to avoid the range of memory that is used
* during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
* the compressed kernel (ZO) and its run space, which is used to extract
* the uncompressed kernel (VO) and relocs.
*
* ZO's full run size sits against the end of the decompression buffer, so
* we can calculate where text, data, bss, etc of ZO are positioned more
* easily.
*
* For additional background, the decompression calculations can be found
* in header.S, and the memory diagram is based on the one found in misc.c.
*
* The following conditions are already enforced by the image layouts and
* associated code:
* - input + input_size >= output + output_size
* - kernel_total_size <= init_size
* - kernel_total_size <= output_size (see Note below)
* - output + init_size >= output + output_size
*
* (Note that kernel_total_size and output_size have no fundamental
* relationship, but output_size is passed to choose_random_location
* as a maximum of the two. The diagram is showing a case where
* kernel_total_size is larger than output_size, but this case is
* handled by bumping output_size.)
*
* The above conditions can be illustrated by a diagram:
*
* 0 output input input+input_size output+init_size
* | | | | |
* | | | | |
* |-----|--------|--------|--------------|-----------|--|-------------|
* | | |
* | | |
* output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
*
* [output, output+init_size) is the entire memory range used for
* extracting the compressed image.
*
* [output, output+kernel_total_size) is the range needed for the
* uncompressed kernel (VO) and its run size (bss, brk, etc).
*
* [output, output+output_size) is VO plus relocs (i.e. the entire
* uncompressed payload contained by ZO). This is the area of the buffer
* written to during decompression.
*
* [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
* range of the copied ZO and decompression code. (i.e. the range
* covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
*
* [input, input+input_size) is the original copied compressed image (ZO)
* (i.e. it does not include its run size). This range must be avoided
* because it contains the data used for decompression.
*
* [input+input_size, output+init_size) is [_text, _end) for ZO. This
* range includes ZO's heap and stack, and must be avoided since it
* performs the decompression.
*
* Since the above two ranges need to be avoided and they are adjacent,
* they can be merged, resulting in: [input, output+init_size) which
* becomes the MEM_AVOID_ZO_RANGE below.
*/
static void mem_avoid_init(unsigned long input, unsigned long input_size,
unsigned long output)
{
unsigned long init_size = boot_params->hdr.init_size;
u64 initrd_start, initrd_size;
u64 cmd_line, cmd_line_size;
char *ptr;
/*
* Avoid the region that is unsafe to overlap during
* decompression.
*/
mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
mem_avoid[MEM_AVOID_ZO_RANGE].size);
/* Avoid initrd. */
initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
initrd_start |= boot_params->hdr.ramdisk_image;
initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
initrd_size |= boot_params->hdr.ramdisk_size;
mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
/* No need to set mapping for initrd, it will be handled in VO. */
/* Avoid kernel command line. */
cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
cmd_line |= boot_params->hdr.cmd_line_ptr;
/* Calculate size of cmd_line. */
ptr = (char *)(unsigned long)cmd_line;
for (cmd_line_size = 0; ptr[cmd_line_size++]; )
;
mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
mem_avoid[MEM_AVOID_CMDLINE].size);
/* Avoid boot parameters. */
mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
mem_avoid[MEM_AVOID_BOOTPARAMS].size);
/* We don't need to set a mapping for setup_data. */
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
add_identity_map(0, PMD_SIZE);
#endif
}
/*
* Does this memory vector overlap a known avoided area? If so, record the
* overlap region with the lowest address.
*/
static bool mem_avoid_overlap(struct mem_vector *img,
struct mem_vector *overlap)
{
int i;
struct setup_data *ptr;
unsigned long earliest = img->start + img->size;
bool is_overlapping = false;
for (i = 0; i < MEM_AVOID_MAX; i++) {
if (mem_overlaps(img, &mem_avoid[i]) &&
mem_avoid[i].start < earliest) {
*overlap = mem_avoid[i];
earliest = overlap->start;
is_overlapping = true;
}
}
/* Avoid all entries in the setup_data linked list. */
ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
while (ptr) {
struct mem_vector avoid;
avoid.start = (unsigned long)ptr;
avoid.size = sizeof(*ptr) + ptr->len;
if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
*overlap = avoid;
earliest = overlap->start;
is_overlapping = true;
}
ptr = (struct setup_data *)(unsigned long)ptr->next;
}
return is_overlapping;
}
static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN];
struct slot_area {
unsigned long addr;
int num;
};
#define MAX_SLOT_AREA 100
static struct slot_area slot_areas[MAX_SLOT_AREA];
static unsigned long slot_max;
static unsigned long slot_area_index;
static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
struct slot_area slot_area;
if (slot_area_index == MAX_SLOT_AREA)
return;
slot_area.addr = region->start;
slot_area.num = (region->size - image_size) /
CONFIG_PHYSICAL_ALIGN + 1;
if (slot_area.num > 0) {
slot_areas[slot_area_index++] = slot_area;
slot_max += slot_area.num;
}
}
static void slots_append(unsigned long addr)
{
/* Overflowing the slots list should be impossible. */
if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN)
return;
slots[slot_max++] = addr;
}
static unsigned long slots_fetch_random(void)
{
/* Handle case of no slots stored. */
if (slot_max == 0)
return 0;
return slots[get_random_long("Physical") % slot_max];
}
static void process_e820_entry(struct e820entry *entry,
unsigned long minimum,
unsigned long image_size)
{
struct mem_vector region, img, overlap;
/* Skip non-RAM entries. */
if (entry->type != E820_RAM)
return;
/* Ignore entries entirely above our maximum. */
if (entry->addr >= KERNEL_IMAGE_SIZE)
return;
/* Ignore entries entirely below our minimum. */
if (entry->addr + entry->size < minimum)
return;
region.start = entry->addr;
region.size = entry->size;
/* Potentially raise address to minimum location. */
if (region.start < minimum)
region.start = minimum;
/* Potentially raise address to meet alignment requirements. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above the bounds of this e820 region? */
if (region.start > entry->addr + entry->size)
return;
/* Reduce size by any delta from the original address. */
region.size -= region.start - entry->addr;
/* Reduce maximum size to fit end of image within maximum limit. */
if (region.start + region.size > KERNEL_IMAGE_SIZE)
region.size = KERNEL_IMAGE_SIZE - region.start;
/* Walk each aligned slot and check for avoided areas. */
for (img.start = region.start, img.size = image_size ;
mem_contains(&region, &img) ;
img.start += CONFIG_PHYSICAL_ALIGN) {
if (mem_avoid_overlap(&img, &overlap))
continue;
slots_append(img.start);
}
}
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
int i;
unsigned long addr;
/* Make sure minimum is aligned. */
minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
/* Verify potential e820 positions, appending to slots list. */
for (i = 0; i < boot_params->e820_entries; i++) {
process_e820_entry(&boot_params->e820_map[i], minimum,
image_size);
}
return slots_fetch_random();
}
static unsigned long find_random_virt_addr(unsigned long minimum,
unsigned long image_size)
{
unsigned long slots, random_addr;
/* Make sure minimum is aligned. */
minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
/* Align image_size for easy slot calculations. */
image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
/*
* How many CONFIG_PHYSICAL_ALIGN-sized slots can hold image_size
* within the range from minimum to KERNEL_IMAGE_SIZE?
*/
slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
CONFIG_PHYSICAL_ALIGN + 1;
random_addr = get_random_long("Virtual") % slots;
return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}
/*
* Since this function works with memory addresses numerically,
* it takes the input and output pointers as 'unsigned long'.
*/
unsigned char *choose_random_location(unsigned long input,
unsigned long input_size,
unsigned long output,
unsigned long output_size)
{
unsigned long choice = output;
unsigned long random_addr;
#ifdef CONFIG_HIBERNATION
if (!cmdline_find_option_bool("kaslr")) {
warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected).");
goto out;
}
#else
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
goto out;
}
#endif
boot_params->hdr.loadflags |= KASLR_FLAG;
/* Record the various known unsafe memory ranges. */
mem_avoid_init(input, input_size, output);
/* Walk e820 and find a random address. */
random_addr = find_random_phys_addr(output, output_size);
if (!random_addr) {
warn("KASLR disabled: could not find suitable E820 region!");
goto out;
}
/* Always enforce the minimum. */
if (random_addr < choice)
goto out;
choice = random_addr;
add_identity_map(choice, output_size);
/* This actually loads the identity pagetable on x86_64. */
finalize_identity_maps();
out:
return (unsigned char *)choice;
}
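
The interval helpers at the top of kaslr.c are self-contained, so their
half-open-range semantics can be sanity-checked in userspace. A minimal
harness (illustrative only, duplicating mem_contains()/mem_overlaps()
outside the boot environment):

#include <assert.h>
#include <stdio.h>

struct mem_vector {
	unsigned long start;
	unsigned long size;
};

/* Same check as the boot code: is 'item' fully inside 'region'? */
static int mem_contains(struct mem_vector *region, struct mem_vector *item)
{
	if (item->start < region->start)
		return 0;
	if (item->start + item->size > region->start + region->size)
		return 0;
	return 1;
}

/* Same check as the boot code: do the two ranges intersect at all? */
static int mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
	if (one->start + one->size <= two->start)
		return 0;
	if (one->start >= two->start + two->size)
		return 0;
	return 1;
}

int main(void)
{
	struct mem_vector region   = { 0x1000, 0x4000 };  /* [0x1000, 0x5000) */
	struct mem_vector inside   = { 0x2000, 0x1000 };  /* [0x2000, 0x3000) */
	struct mem_vector straddle = { 0x4000, 0x2000 };  /* [0x4000, 0x6000) */
	struct mem_vector adjacent = { 0x5000, 0x1000 };  /* [0x5000, 0x6000) */

	assert(mem_contains(&region, &inside));
	assert(!mem_contains(&region, &straddle));
	assert(mem_overlaps(&region, &straddle));
	/* Ranges are half-open, so merely touching ranges do not overlap. */
	assert(!mem_overlaps(&region, &adjacent));
	printf("interval semantics OK\n");
	return 0;
}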


@ -1,8 +1,10 @@
/*
* misc.c
*
* This is a collection of several routines from gzip-1.0.3
* adapted for Linux.
* This is a collection of several routines used to extract the kernel
* which includes KASLR relocation, decompression, ELF parsing, and
* relocation processing. Additionally included are the screen and serial
* output functions and related debugging support functions.
*
* malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
* puts by Nick Holloway 1993, better puts by Martin Mares 1995
@ -10,111 +12,37 @@
*/
#include "misc.h"
#include "error.h"
#include "../string.h"
/* WARNING!!
* This code is compiled with -fPIC and it is relocated dynamically
* at run time, but no relocation processing is performed.
* This means that it is not safe to place pointers in static structures.
*/
#include "../voffset.h"
/*
* Getting to provable safe in place decompression is hard.
* Worst case behaviours need to be analyzed.
* Background information:
*
* The file layout is:
* magic[2]
* method[1]
* flags[1]
* timestamp[4]
* extraflags[1]
* os[1]
* compressed data blocks[N]
* crc[4] orig_len[4]
*
* resulting in 18 bytes of non compressed data overhead.
*
* Files divided into blocks
* 1 bit (last block flag)
* 2 bits (block type)
*
* 1 block occurs every 32K -1 bytes or when 50% compression
* has been achieved. The smallest block type encoding is always used.
*
* stored:
* 32 bits length in bytes.
*
* fixed:
* magic fixed tree.
* symbols.
*
* dynamic:
* dynamic tree encoding.
* symbols.
*
*
* The buffer for decompression in place is the length of the
* uncompressed data, plus a small amount extra to keep the algorithm safe.
* The compressed data is placed at the end of the buffer. The output
* pointer is placed at the start of the buffer and the input pointer
* is placed where the compressed data starts. Problems will occur
* when the output pointer overruns the input pointer.
*
* The output pointer can only overrun the input pointer if the input
* pointer is moving faster than the output pointer. A condition only
* triggered by data whose compressed form is larger than the uncompressed
* form.
*
* The worst case at the block level is a growth of the compressed data
* of 5 bytes per 32767 bytes.
*
* The worst case internal to a compressed block is very hard to figure.
* The worst case can at least be bounded by having one bit that represents
* 32764 bytes and then all of the rest of the bytes representing the very
* very last byte.
*
* All of which is enough to compute an amount of extra data that is required
* to be safe. To avoid problems at the block level allocating 5 extra bytes
* per 32767 bytes of data is sufficient. To avoid problems internal to a
* block adding an extra 32767 bytes (the worst case uncompressed block size)
* is sufficient, to ensure that in the worst case the decompressed data for
* block will stop the byte before the compressed data for a block begins.
* To avoid problems with the compressed data's meta information an extra 18
* bytes are needed. Leading to the formula:
*
* extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
*
* Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
* Adding 32768 instead of 32767 just makes for round numbers.
* Adding the decompressor_size is necessary as it must live after all
* of the data as well. Last I measured the decompressor is about 14K.
* 10K of actual data and 4K of bss.
*
* WARNING!!
* This code is compiled with -fPIC and it is relocated dynamically at
* run time, but no relocation processing is performed. This means that
* it is not safe to place pointers in static structures.
*/
/*
* gzip declarations
*/
/* Macros used by the included decompressor code below. */
#define STATIC static
#undef memcpy
/*
* Use a normal definition of memset() from string.c. There are already
* Use normal definitions of mem*() from string.c. There are already
* included header files which expect a definition of memset() and by
* the time we define memset macro, it is too late.
*/
#undef memcpy
#undef memset
#define memzero(s, n) memset((s), 0, (n))
#define memmove memmove
static void error(char *m);
/* Functions used by the included decompressor code below. */
void *memmove(void *dest, const void *src, size_t n);
/*
* This is set up by the setup-routine at boot-time
*/
struct boot_params *real_mode; /* Pointer to real-mode data */
struct boot_params *boot_params;
memptr free_mem_ptr;
memptr free_mem_end_ptr;
@ -146,12 +74,16 @@ static int lines, cols;
#ifdef CONFIG_KERNEL_LZ4
#include "../../../../lib/decompress_unlz4.c"
#endif
/*
* NOTE: When adding a new decompressor, please update the analysis in
* ../header.S.
*/
static void scroll(void)
{
int i;
memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
vidmem[i] = ' ';
}
@ -184,12 +116,12 @@ void __putstr(const char *s)
}
}
if (real_mode->screen_info.orig_video_mode == 0 &&
if (boot_params->screen_info.orig_video_mode == 0 &&
lines == 0 && cols == 0)
return;
x = real_mode->screen_info.orig_x;
y = real_mode->screen_info.orig_y;
x = boot_params->screen_info.orig_x;
y = boot_params->screen_info.orig_y;
while ((c = *s++) != '\0') {
if (c == '\n') {
@ -210,8 +142,8 @@ void __putstr(const char *s)
}
}
real_mode->screen_info.orig_x = x;
real_mode->screen_info.orig_y = y;
boot_params->screen_info.orig_x = x;
boot_params->screen_info.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
outb(14, vidport);
@ -237,23 +169,13 @@ void __puthex(unsigned long value)
}
}
static void error(char *x)
{
error_putstr("\n\n");
error_putstr(x);
error_putstr("\n\n -- System halted");
while (1)
asm("hlt");
}
#if CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len)
{
int *reloc;
unsigned long delta, map, ptr;
unsigned long min_addr = (unsigned long)output;
unsigned long max_addr = min_addr + output_len;
unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
/*
* Calculate the delta between where vmlinux was linked to load
@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len)
* So we work backwards from the end of the decompressed image.
*/
for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
int extended = *reloc;
long extended = *reloc;
extended += map;
ptr = (unsigned long)extended;
@ -372,9 +294,7 @@ static void parse_elf(void *output)
#else
dest = (void *)(phdr->p_paddr);
#endif
memcpy(dest,
output + phdr->p_offset,
phdr->p_filesz);
memmove(dest, output + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
@ -383,23 +303,41 @@ static void parse_elf(void *output)
free(phdrs);
}
asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
* image (VO) and the execution environment (.bss, .brk), which makes sure
* there is room to do the in-place decompression. (See header.S for the
* calculations.)
*
* |-----compressed kernel image------|
* V V
* 0 extract_offset +INIT_SIZE
* |-----------|---------------|-------------------------|--------|
* | | | |
* VO__text startup_32 of ZO VO__end ZO__end
* ^ ^
* |-------uncompressed kernel image---------|
*
*/
asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
unsigned long output_len,
unsigned long run_size)
unsigned long output_len)
{
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned char *output_orig = output;
real_mode = rmode;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
/* Clear it for solely in-kernel use */
real_mode->hdr.loadflags &= ~KASLR_FLAG;
/* Clear flags intended for solely in-kernel use. */
boot_params->hdr.loadflags &= ~KASLR_FLAG;
sanitize_boot_params(real_mode);
sanitize_boot_params(boot_params);
if (real_mode->screen_info.orig_video_mode == 7) {
if (boot_params->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
vidport = 0x3d4;
}
lines = real_mode->screen_info.orig_video_lines;
cols = real_mode->screen_info.orig_video_cols;
lines = boot_params->screen_info.orig_video_lines;
cols = boot_params->screen_info.orig_video_cols;
console_init();
debug_putstr("early console in decompress_kernel\n");
debug_putstr("early console in extract_kernel\n");
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
debug_putaddr(input_len);
debug_putaddr(output);
debug_putaddr(output_len);
debug_putaddr(run_size);
debug_putaddr(kernel_total_size);
/*
* The memory hole needed for the kernel is the larger of either
* the entire decompressed kernel plus relocation table, or the
* entire decompressed kernel plus .bss and .brk sections.
*/
output = choose_kernel_location(real_mode, input_data, input_len, output,
output_len > run_size ? output_len
: run_size);
output = choose_random_location((unsigned long)input_data, input_len,
(unsigned long)output,
max(output_len, kernel_total_size));
/* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))


@ -32,7 +32,7 @@
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
extern struct boot_params *real_mode; /* Pointer to real-mode data */
extern struct boot_params *boot_params;
void __putstr(const char *s);
void __puthex(unsigned long value);
#define error_putstr(__x) __putstr(__x)
@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option);
#if CONFIG_RANDOMIZE_BASE
/* aslr.c */
unsigned char *choose_kernel_location(struct boot_params *boot_params,
unsigned char *input,
/* kaslr.c */
unsigned char *choose_random_location(unsigned long input_ptr,
unsigned long input_size,
unsigned char *output,
unsigned long output_ptr,
unsigned long output_size);
/* cpuflags.c */
bool has_cpuflag(int flag);
#else
static inline
unsigned char *choose_kernel_location(struct boot_params *boot_params,
unsigned char *input,
unsigned char *choose_random_location(unsigned long input_ptr,
unsigned long input_size,
unsigned char *output,
unsigned long output_ptr,
unsigned long output_size)
{
return output;
return (unsigned char *)output_ptr;
}
#endif
#ifdef CONFIG_X86_64
void add_identity_map(unsigned long start, unsigned long size);
void finalize_identity_maps(void);
extern unsigned char _pgtable[];
#else
static inline void add_identity_map(unsigned long start, unsigned long size)
{ }
static inline void finalize_identity_maps(void)
{ }
#endif
#ifdef CONFIG_EARLY_PRINTK
/* early_serial_console.c */
extern int early_serial_base;


@ -18,11 +18,10 @@
*
* H. Peter Anvin <hpa@linux.intel.com>
*
* ----------------------------------------------------------------------- */
/*
* Compute the desired load offset from a compressed program; outputs
* a small assembly wrapper with the appropriate symbols defined.
* -----------------------------------------------------------------------
*
* Outputs a small assembly wrapper with the appropriate symbols defined.
*
*/
#include <stdlib.h>
@ -35,14 +34,11 @@ int main(int argc, char *argv[])
{
uint32_t olen;
long ilen;
unsigned long offs;
unsigned long run_size;
FILE *f = NULL;
int retval = 1;
if (argc < 3) {
fprintf(stderr, "Usage: %s compressed_file run_size\n",
argv[0]);
if (argc < 2) {
fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
goto bail;
}
@ -67,29 +63,11 @@ int main(int argc, char *argv[])
ilen = ftell(f);
olen = get_unaligned_le32(&olen);
/*
* Now we have the input (compressed) and output (uncompressed)
* sizes, compute the necessary decompression offset...
*/
offs = (olen > ilen) ? olen - ilen : 0;
offs += olen >> 12; /* Add 8 bytes for each 32K block */
offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
run_size = atoi(argv[2]);
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
printf("z_input_len = %lu\n", ilen);
printf(".globl z_output_len\n");
printf("z_output_len = %lu\n", (unsigned long)olen);
printf(".globl z_extract_offset\n");
printf("z_extract_offset = 0x%lx\n", offs);
/* z_extract_offset_negative allows simplification of head_32.S */
printf(".globl z_extract_offset_negative\n");
printf("z_extract_offset_negative = -0x%lx\n", offs);
printf(".globl z_run_size\n");
printf("z_run_size = %lu\n", run_size);
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");


@ -0,0 +1,129 @@
/*
* This code is used on x86_64 to create page table identity mappings on
* demand by building up a new set of page tables (or appending to the
* existing ones), and then switching over to them when ready.
*/
/*
* Since we're dealing with identity mappings, physical and virtual
* addresses are the same, so override these defines which are ultimately
* used by the headers in misc.h.
*/
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)((unsigned long)(x)))
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
#include <asm/init.h>
#include <asm/pgtable.h>
#include "../../mm/ident_map.c"
/* Used by pgtable.h asm code to force instruction serialization. */
unsigned long __force_order;
/* Used to track our page table allocation area. */
struct alloc_pgt_data {
unsigned char *pgt_buf;
unsigned long pgt_buf_size;
unsigned long pgt_buf_offset;
};
/*
* Allocates space for a page table entry, using struct alloc_pgt_data
* above. Besides the local callers, this is used as the allocation
* callback in mapping_info below.
*/
static void *alloc_pgt_page(void *context)
{
struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
unsigned char *entry;
/* Validate there is space available for a new page. */
if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
debug_putaddr(pages->pgt_buf_offset);
debug_putaddr(pages->pgt_buf_size);
return NULL;
}
entry = pages->pgt_buf + pages->pgt_buf_offset;
pages->pgt_buf_offset += PAGE_SIZE;
return entry;
}
/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
static unsigned long level4p;
/* Locates and clears a region for a new top level page table. */
static void prepare_level4(void)
{
/*
* It should be impossible for this not to already be true,
* but since calling this a second time would rewind the other
* counters, let's just make sure this is reset too.
*/
pgt_data.pgt_buf_offset = 0;
/*
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
*/
level4p = read_cr3();
if (level4p == (unsigned long)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
} else {
debug_putstr("booted via startup_64()\n");
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
level4p = (unsigned long)alloc_pgt_page(&pgt_data);
}
}
/*
* Adds the specified range to what will become the new identity mappings.
* Once all ranges have been added, the new mapping is activated by calling
* finalize_identity_maps() below.
*/
void add_identity_map(unsigned long start, unsigned long size)
{
struct x86_mapping_info mapping_info = {
.alloc_pgt_page = alloc_pgt_page,
.context = &pgt_data,
.pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
};
unsigned long end = start + size;
/* Make sure we have a top level page table ready to use. */
if (!level4p)
prepare_level4();
/* Align boundary to 2M. */
start = round_down(start, PMD_SIZE);
end = round_up(end, PMD_SIZE);
if (start >= end)
return;
/* Build the mapping. */
kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
start, end);
}
/*
* This switches the page tables to the new level4 that has been built
* via calls to add_identity_map() above. If booted via startup_32(),
* this is effectively a no-op.
*/
void finalize_identity_maps(void)
{
write_cr3(level4p);
}
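
add_identity_map() always rounds the requested range out to whole 2MB
PMD entries before calling kernel_ident_mapping_init(). The rounding in
isolation (a hypothetical sketch assuming PMD_SIZE is 2MB as on x86_64,
with round_down()/round_up() mirroring the kernel macros for
power-of-two sizes):

#include <stdio.h>

#define PMD_SIZE		(2UL << 20)	/* 2MB mappings on x86_64 */

/* Power-of-two versions of the kernel's round_down()/round_up(). */
#define round_down(x, y)	((x) & ~((y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	/* e.g. a small command-line buffer at an arbitrary address */
	unsigned long start = 0x137fd000;
	unsigned long end   = start + 0x42;

	start = round_down(start, PMD_SIZE);
	end   = round_up(end, PMD_SIZE);

	/* The identity mapping covers whole 2MB-aligned PMD entries. */
	printf("mapping [%#lx, %#lx)\n", start, end);
	return 0;
}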


@ -1,7 +1,16 @@
/*
* This provides an optimized implementation of memcpy, and a simplified
* implementation of memset and memmove. These are used here because the
* standard kernel runtime versions are not yet available and we don't
* trust the gcc built-in implementations as they may do unexpected things
* (e.g. FPU ops) in the minimal decompression stub execution environment.
*/
#include "error.h"
#include "../string.c"
#ifdef CONFIG_X86_32
void *memcpy(void *dest, const void *src, size_t n)
static void *__memcpy(void *dest, const void *src, size_t n)
{
int d0, d1, d2;
asm volatile(
@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n)
return dest;
}
#else
void *memcpy(void *dest, const void *src, size_t n)
static void *__memcpy(void *dest, const void *src, size_t n)
{
long d0, d1, d2;
asm volatile(
@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n)
ss[i] = c;
return s;
}
void *memmove(void *dest, const void *src, size_t n)
{
unsigned char *d = dest;
const unsigned char *s = src;
if (d <= s || d - s >= n)
return __memcpy(dest, src, n);
while (n-- > 0)
d[n] = s[n];
return dest;
}
/* Detect and warn about potential overlaps, but handle them with memmove. */
void *memcpy(void *dest, const void *src, size_t n)
{
if (dest > src && dest - src < n) {
warn("Avoiding potentially unsafe overlapping memcpy()!");
return memmove(dest, src, n);
}
return __memcpy(dest, src, n);
}
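
The new overlap guard fires whenever dest overruns src within n bytes;
everything else goes through the optimized __memcpy(). The condition can
be demonstrated standalone (illustrative sketch, not part of this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "abcdefgh";
	char *src  = buf;
	char *dest = buf + 2;	/* dest > src && dest - src < n: overlap */
	size_t n = 6;

	if (dest > src && (size_t)(dest - src) < n)
		printf("boot code would warn and fall back to memmove()\n");

	memmove(dest, src, n);	/* the safe copy the fallback performs */
	printf("%s\n", buf);	/* prints "ababcdef" */
	return 0;
}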


@ -70,5 +70,6 @@ SECTIONS
_epgtable = . ;
}
#endif
. = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */
_end = .;
}


@ -1,3 +1,7 @@
/*
* Serial port routines for use during early boot reporting. This code is
* included from both the compressed kernel and the regular kernel.
*/
#include "boot.h"
#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */


@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to
pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
#
# Getting to provably safe in-place decompression is hard. Worst case
# behaviours need to be analyzed. Here let's take the decompression of
# a gzip-compressed kernel as example, to illustrate it:
#
# The file layout of gzip compressed kernel is:
#
# magic[2]
# method[1]
# flags[1]
# timestamp[4]
# extraflags[1]
# os[1]
# compressed data blocks[N]
# crc[4] orig_len[4]
#
# ... resulting in +18 bytes overhead of uncompressed data.
#
# (For more information, please refer to RFC 1951 and RFC 1952.)
#
# Files divided into blocks
# 1 bit (last block flag)
# 2 bits (block type)
#
# 1 block occurs every 32K -1 bytes or when 50% compression
# has been achieved. The smallest block type encoding is always used.
#
# stored:
# 32 bits length in bytes.
#
# fixed:
# magic fixed tree.
# symbols.
#
# dynamic:
# dynamic tree encoding.
# symbols.
#
#
# The buffer for decompression in place is the length of the uncompressed
# data, plus a small amount extra to keep the algorithm safe. The
# compressed data is placed at the end of the buffer. The output pointer
# is placed at the start of the buffer and the input pointer is placed
# where the compressed data starts. Problems will occur when the output
# pointer overruns the input pointer.
#
# The output pointer can only overrun the input pointer if the input
# pointer is moving faster than the output pointer. A condition only
# triggered by data whose compressed form is larger than the uncompressed
# form.
#
# The worst case at the block level is a growth of the compressed data
# of 5 bytes per 32767 bytes.
#
# The worst case internal to a compressed block is very hard to figure.
# The worst case can at least be bounded by having one bit that represents
# 32764 bytes and then all of the rest of the bytes representing the very
# very last byte.
#
# All of which is enough to compute an amount of extra data that is required
# to be safe. To avoid problems at the block level allocating 5 extra bytes
# per 32767 bytes of data is sufficient. To avoid problems internal to a
# block adding an extra 32767 bytes (the worst case uncompressed block size)
# is sufficient, to ensure that in the worst case the decompressed data for
# block will stop the byte before the compressed data for a block begins.
# To avoid problems with the compressed data's meta information an extra 18
# bytes are needed. Leading to the formula:
#
# extra_bytes = (uncompressed_size >> 12) + 32768 + 18
#
# Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
# Adding 32768 instead of 32767 just makes for round numbers.
#
# The above analysis is for decompressing a gzip-compressed kernel only.
# Up to now, 6 different decompressors are supported altogether. Among
# them, xz stores data in chunks and has a maximum chunk size of 64K.
# Hence the safety margin should be updated to cover all decompressors,
# so that we don't need to deal with each of them separately. Please
# check the description in lib/decompressor_xxx.c for specific information.
#
# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128)
#if ZO_z_output_len > ZO_z_input_len
# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
ZO_z_input_len)
#else
# define ZO_z_extract_offset ZO_z_extra_bytes
#endif
/*
* The extract_offset has to be bigger than ZO head section. Otherwise when
* the head code is running to move ZO to the end of the buffer, it will
* overwrite the head code itself.
*/
#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset
# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095)
#else
# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095)
#endif
#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset)
#define VO_INIT_SIZE (VO__end - VO__text)
#if ZO_INIT_SIZE > VO_INIT_SIZE
#define INIT_SIZE ZO_INIT_SIZE
# define INIT_SIZE ZO_INIT_SIZE
#else
#define INIT_SIZE VO_INIT_SIZE
# define INIT_SIZE VO_INIT_SIZE
#endif
init_size: .long INIT_SIZE # kernel initialization size
handover_offset: .long 0 # Filled in by build.c
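
Plugging sample numbers into the updated formula: for a hypothetical
kernel with a 28MB uncompressed image compressed down to 7MB, the
extract offset computes as below (standalone sketch; real values come
from the ZO_* symbols at build time):

#include <stdio.h>

int main(void)
{
	unsigned long output_len = 28UL << 20;	/* hypothetical z_output_len */
	unsigned long input_len  = 7UL << 20;	/* hypothetical z_input_len */

	/* extra_bytes = (uncompressed_size >> 12) + 65536 + 128 */
	unsigned long extra = (output_len >> 12) + 65536 + 128;

	/* output_len > input_len, so the offset must reach past the end... */
	unsigned long offset = output_len + extra - input_len;

	/* ...rounded up to a 4K boundary, as the macros above do. */
	offset = (offset + 4095) & ~4095UL;

	printf("extra_bytes = %lu, extract_offset = %#lx\n", extra, offset);
	return 0;
}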


@ -12,29 +12,46 @@
/* Minimum kernel alignment, as a power of two */
#ifdef CONFIG_X86_64
#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
#else
#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
#endif
#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
(CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
# error "Invalid value for CONFIG_PHYSICAL_ALIGN"
#endif
#ifdef CONFIG_KERNEL_BZIP2
#define BOOT_HEAP_SIZE 0x400000
# define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
#define BOOT_HEAP_SIZE 0x10000
#endif /* !CONFIG_KERNEL_BZIP2 */
# define BOOT_HEAP_SIZE 0x10000
#endif
#ifdef CONFIG_X86_64
#define BOOT_STACK_SIZE 0x4000
#else
#define BOOT_STACK_SIZE 0x1000
# define BOOT_STACK_SIZE 0x4000
# define BOOT_INIT_PGT_SIZE (6*4096)
# ifdef CONFIG_RANDOMIZE_BASE
/*
* Worst case, assuming all mapped regions cross a 512GB boundary:
* 1 page for level4
* (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel
* 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP).
* Total is 19 pages.
*/
# ifdef CONFIG_X86_VERBOSE_BOOTUP
# define BOOT_PGT_SIZE (19*4096)
# else /* !CONFIG_X86_VERBOSE_BOOTUP */
# define BOOT_PGT_SIZE (17*4096)
# endif
# else /* !CONFIG_RANDOMIZE_BASE */
# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
# endif
#else /* !CONFIG_X86_64 */
# define BOOT_STACK_SIZE 0x1000
#endif
#endif /* _ASM_X86_BOOT_H */
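/*
 * A quick sanity check of the page counts behind BOOT_PGT_SIZE, expanding
 * the (2+2)*4 shorthand above (illustrative only, restating the comment):
 *
 *   1 level-4 page
 * + 4 regions (kernel, param, cmd_line, randomized kernel)
 *   x 4 pages each (2 PUD + 2 PMD when straddling a 512GB boundary) = 16
 * + 2 pages for the first 2M of video RAM (verbose bootup only)
 * = 19 pages, or 17 when CONFIG_X86_VERBOSE_BOOTUP is off.
 */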

View File

@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
#endif
#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
This seems to be the official gcc blessed way to do such arithmetic. */
@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
#define __pa_symbol(x) \
__phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
#ifndef __va
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
#endif
#define __boot_va(x) __va(x)
#define __boot_pa(x) __pa(x)

View File

@ -47,12 +47,10 @@
* are fully set up. If kernel ASLR is configured, it can extend the
* kernel page table mapping, reducing the size of the modules area.
*/
#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024)
#if defined(CONFIG_RANDOMIZE_BASE) && \
CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET
#if defined(CONFIG_RANDOMIZE_BASE)
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif
#endif /* _ASM_X86_PAGE_64_DEFS_H */

View File

@ -15,17 +15,6 @@
#include <linux/cpumask.h>
#include <asm/frame.h>
static inline int paravirt_enabled(void)
{
return pv_info.paravirt_enabled;
}
static inline int paravirt_has_feature(unsigned int feature)
{
WARN_ON_ONCE(!pv_info.paravirt_enabled);
return (pv_info.features & feature);
}
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{

View File

@ -69,15 +69,9 @@ struct pv_info {
u16 extra_user_64bit_cs; /* __USER_CS if none */
#endif
int paravirt_enabled;
unsigned int features; /* valid only if paravirt_enabled is set */
const char *name;
};
#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
/* Supported features */
#define PV_SUPPORTED_RTC (1<<0)
struct pv_init_ops {
/*
* Patch may replace one of the defined code sequences with

View File

@ -480,8 +480,6 @@ static inline unsigned long current_top_of_stack(void)
#include <asm/paravirt.h>
#else
#define __cpuid native_cpuid
#define paravirt_enabled() 0
#define paravirt_has(x) 0
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)

View File

@ -141,6 +141,44 @@ struct x86_cpuinit_ops {
struct timespec;
/**
* struct x86_legacy_devices - legacy x86 devices
*
* @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform
* is known to never have a PNPBIOS.
*
* These are devices known to require LPC or ISA bus. The definition of legacy
* devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag
* ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on
* the LPC or ISA bus. User visible devices are devices that have end-user
* accessible connectors (for example, LPT parallel port). Legacy devices on
* the LPC bus consist for example of serial and parallel ports, PS/2 keyboard
* / mouse, and the floppy disk controller. A system that lacks all known
* legacy devices can assume all devices can be detected exclusively via
* standard device enumeration mechanisms including the ACPI namespace.
*
* A system that does not have ACPI_FADT_LEGACY_DEVICES enabled must not
* have any of the legacy devices enumerated below present.
*/
struct x86_legacy_devices {
int pnpbios;
};
/**
* struct x86_legacy_features - legacy x86 features
*
* @rtc: this device has a CMOS real-time clock present
* @ebda_search: it's safe to search for the EBDA signature in the hardware's
* low RAM
* @devices: legacy x86 devices, refer to struct x86_legacy_devices
* documentation for further details.
*/
struct x86_legacy_features {
int rtc;
int ebda_search;
struct x86_legacy_devices devices;
};
/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_tsc: calibrate TSC
@ -152,6 +190,14 @@ struct timespec;
* @save_sched_clock_state: save state for sched_clock() on suspend
* @restore_sched_clock_state: restore state for sched_clock() on resume
* @apic_post_init: adjust apic if needed
* @legacy: legacy features
* @set_legacy_features: override legacy features. Use of this callback
* is highly discouraged. You should only need
* this if your hardware platform requires further
* custom fine-tuning far beyond what may be
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
@ -165,6 +211,8 @@ struct x86_platform_ops {
void (*save_sched_clock_state)(void);
void (*restore_sched_clock_state)(void);
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
};
struct pci_dev;
@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit;
extern struct x86_platform_ops x86_platform;
extern struct x86_msi_ops x86_msi;
extern struct x86_io_apic_ops x86_io_apic_ops;
extern void x86_early_init_platform_quirks(void);
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);

View File

@ -157,7 +157,46 @@ struct boot_params {
__u8 _pad9[276]; /* 0xeec */
} __attribute__((packed));
enum {
/**
* enum x86_hardware_subarch - x86 hardware subarchitecture
*
* The x86 hardware_subarch and hardware_subarch_data were added as of the x86
* boot protocol 2.07 to help distinguish and support custom x86 boot
* sequences. This enum represents accepted values for the x86
* hardware_subarch. When a custom x86 boot sequence (not X86_SUBARCH_PC) does
* not have, or simply *cannot* make use of, natural stubs like BIOS or EFI,
* the hardware_subarch can be used on the Linux entry path to revector to a
* subarchitecture stub when needed. This subarchitecture stub can be used to
* set up Linux boot parameters or for special care to account for nonstandard
* handling of page tables.
*
* This enum should only ever be used by x86 code, and the code that uses
* it should be well contained and compartmentalized.
*
* KVM and Xen HVM do not have a subarch as these are expected to follow
* standard x86 boot entries. If there is a genuine need for "hypervisor" type
* that should be considered separately in the future. Future guest types
* should seriously consider working with standard x86 boot stubs such as
* the BIOS or EFI boot stubs.
*
* WARNING: this enum is only used for legacy hacks, for platform features that
* are not easily enumerated or discoverable. You should not ever use
* this for new features.
*
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
* currently supported through this PV boot path.
* @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
* systems which do not have the PCI legacy interfaces.
* @X86_SUBARCH_CE4100: Used for the Intel CE media processor (CE4100) SoC
* for set-top boxes and media devices; the use of a subarch for CE4100
* is more of a hack...
*/
enum x86_hardware_subarch {
X86_SUBARCH_PC = 0,
X86_SUBARCH_LGUEST,
X86_SUBARCH_XEN,

View File

@ -2,7 +2,11 @@
# Makefile for the linux kernel.
#
extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
extra-y := head_$(BITS).o
extra-y += head$(BITS).o
extra-y += ebda.o
extra-y += platform-quirks.o
extra-y += vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)

View File

@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource);
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
pr_debug("ACPI: no legacy devices present\n");
x86_platform.legacy.devices.pnpbios = 0;
}
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
pr_debug("ACPI: not registering RTC platform device\n");
x86_platform.legacy.rtc = 0;
}
#ifdef CONFIG_X86_PM_TIMER
/* detect the location of the ACPI PM Timer */

View File

@ -2267,7 +2267,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
if (apm_info.bios.version == 0 || machine_is_olpc()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}

View File

@ -80,6 +80,7 @@ void common(void) {
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
OFFSET(BP_init_size, boot_params, hdr.init_size);
OFFSET(BP_pref_address, boot_params, hdr.pref_address);
OFFSET(BP_code32_start, boot_params, hdr.code32_start);

View File

@ -233,7 +233,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) {
if (c->x86 == 5 && c->x86_model < 9) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);

View File

@ -38,7 +38,7 @@ void __init reserve_ebda_region(void)
* that the paravirt case can handle memory setup
* correctly, without our help.
*/
if (paravirt_enabled())
if (!x86_platform.legacy.ebda_search)
return;
/* end of low (conventional) memory */

View File

@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void)
cr4_init_shadow();
sanitize_boot_params(&boot_params);
x86_early_init_platform_quirks();
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_INTEL_MID:

View File

@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
if (!boot_params.hdr.version)
copy_bootdata(__va(real_mode_data));
x86_early_init_platform_quirks();
reserve_ebda_region();
switch (boot_params.hdr.hardware_subarch) {

View File

@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
/*
* KVM isn't paravirt in the sense of paravirt_enabled. A KVM
* guest kernel works like a bare metal kernel with additional
* features, and paravirt_enabled is about features that are
* missing.
*/
pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;

View File

@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
.kernel_rpl = 0,
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */

View File

@ -0,0 +1,35 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/setup.h>
#include <asm/bios_ebda.h>
void __init x86_early_init_platform_quirks(void)
{
x86_platform.legacy.rtc = 1;
x86_platform.legacy.ebda_search = 0;
x86_platform.legacy.devices.pnpbios = 1;
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_PC:
x86_platform.legacy.ebda_search = 1;
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
case X86_SUBARCH_INTEL_MID:
case X86_SUBARCH_CE4100:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
}
if (x86_platform.set_legacy_features)
x86_platform.set_legacy_features();
}
#if defined(CONFIG_PNPBIOS)
bool __init arch_pnpbios_disabled(void)
{
return x86_platform.legacy.devices.pnpbios == 0;
}
#endif
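/*
 * The set_legacy_features() hook invoked above is how a platform can
 * override these defaults. A minimal sketch, modeled on the Xen dom0
 * hook elsewhere in this series; the example_* names are hypothetical:
 */
static void __init example_set_legacy_features(void)
{
	/* Hypothetical board: it does have a CMOS RTC even though its
	 * subarch default above turned the RTC off. */
	x86_platform.legacy.rtc = 1;
}

static void __init example_early_setup(void)
{
	/* Must be installed before x86_early_init_platform_quirks() runs. */
	x86_platform.set_legacy_features = example_set_legacy_features;
}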

View File

@ -14,6 +14,7 @@
#include <asm/time.h>
#include <asm/intel-mid.h>
#include <asm/rtc.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
/*
@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void)
}
}
#endif
if (of_have_populated_dt())
return 0;
/* Intel MID platforms don't have ioport rtc */
if (intel_mid_identify_cpu())
return -ENODEV;
#ifdef CONFIG_ACPI
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
/* This warning can likely go away again in a year or two. */
pr_info("ACPI: not registering RTC platform device\n");
return -ENODEV;
}
#endif
if (paravirt_enabled() && !paravirt_has(RTC))
if (!x86_platform.legacy.rtc)
return -ENODEV;
platform_device_register(&rtc_device);

View File

@ -74,12 +74,6 @@ void __init tboot_probe(void)
return;
}
/* only a natively booted kernel should be using TXT */
if (paravirt_enabled()) {
pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
return;
}
/* Map and check for tboot UUID. */
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);

View File

@ -334,7 +334,7 @@ SECTIONS
__brk_limit = .;
}
. = ALIGN(PAGE_SIZE);
. = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
STABS_DEBUG

View File

@ -1408,13 +1408,10 @@ __init void lguest_init(void)
{
/* We're under lguest. */
pv_info.name = "lguest";
/* Paravirt is enabled. */
pv_info.paravirt_enabled = 1;
/* We're running at privilege level 1, not 0 as normal. */
pv_info.kernel_rpl = 1;
/* Everyone except Xen runs with this set. */
pv_info.shared_kernel_pmd = 1;
pv_info.features = 0;
/*
* We set up all the lguest overrides for sensitive operations. These

79
arch/x86/mm/ident_map.c Normal file
View File

@ -0,0 +1,79 @@
/*
* Helper routines for building identity mapping page tables. This is
* included by both the compressed kernel and the regular kernel.
*/
static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
addr &= PMD_MASK;
for (; addr < end; addr += PMD_SIZE) {
pmd_t *pmd = pmd_page + pmd_index(addr);
if (!pmd_present(*pmd))
set_pmd(pmd, __pmd(addr | pmd_flag));
}
}
static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
if (pud_present(*pud)) {
pmd = pmd_offset(pud, 0);
ident_pmd_init(info->pmd_flag, pmd, addr, next);
continue;
}
pmd = (pmd_t *)info->alloc_pgt_page(info->context);
if (!pmd)
return -ENOMEM;
ident_pmd_init(info->pmd_flag, pmd, addr, next);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
return 0;
}
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
int result;
int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr) + off;
pud_t *pud;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, 0);
result = ident_pud_init(info, pud, addr, next);
if (result)
return result;
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
return -ENOMEM;
result = ident_pud_init(info, pud, addr, next);
if (result)
return result;
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
return 0;
}
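/*
 * kernel_ident_mapping_init() leaves page-table allocation to the caller
 * via info->alloc_pgt_page. A minimal caller sketch; the pool type below
 * is illustrative, and using __PAGE_KERNEL_LARGE_EXEC for pmd_flag is an
 * assumption about the KASLR user, not taken from this file:
 */
struct pgt_pool {
	unsigned char *buf;	/* preallocated page-table memory */
	unsigned long size;	/* pool size in bytes */
	unsigned long offset;	/* next free byte */
};

static void *pool_alloc_pgt_page(void *context)
{
	struct pgt_pool *pool = context;
	unsigned char *page;

	/* Hand out one zeroed page per call; returning NULL makes
	 * kernel_ident_mapping_init() bail out with -ENOMEM. */
	if (pool->offset + PAGE_SIZE > pool->size)
		return NULL;

	page = pool->buf + pool->offset;
	pool->offset += PAGE_SIZE;
	memset(page, 0, PAGE_SIZE);
	return page;
}

static int identity_map_range(struct pgt_pool *pool, pgd_t *pgd,
			      unsigned long start, unsigned long end)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= pool_alloc_pgt_page,
		.context	= pool,
		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
	};

	return kernel_ident_mapping_init(&info, pgd, start, end);
}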

View File

@ -804,9 +804,6 @@ void __init mem_init(void)
BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
#ifdef CONFIG_RANDOMIZE_BASE
BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
#endif
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);

View File

@ -58,79 +58,7 @@
#include "mm_internal.h"
static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
addr &= PMD_MASK;
for (; addr < end; addr += PMD_SIZE) {
pmd_t *pmd = pmd_page + pmd_index(addr);
if (!pmd_present(*pmd))
set_pmd(pmd, __pmd(addr | pmd_flag));
}
}
static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
if (pud_present(*pud)) {
pmd = pmd_offset(pud, 0);
ident_pmd_init(info->pmd_flag, pmd, addr, next);
continue;
}
pmd = (pmd_t *)info->alloc_pgt_page(info->context);
if (!pmd)
return -ENOMEM;
ident_pmd_init(info->pmd_flag, pmd, addr, next);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
return 0;
}
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
int result;
int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr) + off;
pud_t *pud;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, 0);
result = ident_pud_init(info, pud, addr, next);
if (result)
return result;
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
return -ENOMEM;
result = ident_pud_init(info, pud, addr, next);
if (result)
return result;
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
return 0;
}
#include "ident_map.c"
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the

View File

@ -1,42 +0,0 @@
#!/bin/sh
#
# Calculate the amount of space needed to run the kernel, including room for
# the .bss and .brk sections.
#
# Usage:
# objdump -h a.out | sh calc_run_size.sh
NUM='\([0-9a-fA-F]*[ \t]*\)'
OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p')
if [ -z "$OUT" ] ; then
echo "Never found .bss or .brk file offset" >&2
exit 1
fi
OUT=$(echo ${OUT# })
sizeA=$(printf "%d" 0x${OUT%% *})
OUT=${OUT#* }
offsetA=$(printf "%d" 0x${OUT%% *})
OUT=${OUT#* }
sizeB=$(printf "%d" 0x${OUT%% *})
OUT=${OUT#* }
offsetB=$(printf "%d" 0x${OUT%% *})
run_size=$(( $offsetA + $sizeA + $sizeB ))
# BFD linker shows the same file offset in ELF.
if [ "$offsetA" -ne "$offsetB" ] ; then
# Gold linker shows them as consecutive.
endB=$(( $offsetB + $sizeB ))
if [ "$endB" != "$run_size" ] ; then
printf "sizeA: 0x%x\n" $sizeA >&2
printf "offsetA: 0x%x\n" $offsetA >&2
printf "sizeB: 0x%x\n" $sizeB >&2
printf "offsetB: 0x%x\n" $offsetB >&2
echo ".bss and .brk are non-contiguous" >&2
exit 1
fi
fi
printf "%d\n" $run_size
exit 0

View File

@ -1206,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
}
static const struct pv_info xen_info __initconst = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
.features = 0,
.name = "Xen",
};
@ -1528,6 +1526,11 @@ static void __init xen_pvh_early_guest_init(void)
}
#endif /* CONFIG_XEN_PVH */
static void __init xen_dom0_set_legacy_features(void)
{
x86_platform.legacy.rtc = 1;
}
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
@ -1548,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
if (xen_initial_domain())
pv_info.features |= PV_SUPPORTED_RTC;
pv_init_ops = xen_init_ops;
if (!xen_pvh_domain()) {
pv_cpu_ops = xen_cpu_ops;
@ -1684,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
boot_params.hdr.ramdisk_image = initrd_start;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
if (!xen_initial_domain()) {
add_preferred_console("xenboot", 0, NULL);
@ -1701,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
};
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
xen_init_vga(info, xen_start_info->console.dom0.info_size);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;

View File

@ -521,10 +521,11 @@ static int __init pnpbios_init(void)
int ret;
if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) ||
paravirt_enabled()) {
arch_pnpbios_disabled()) {
printk(KERN_INFO "PnPBIOS: Disabled\n");
return -ENODEV;
}
#ifdef CONFIG_PNPACPI
if (!acpi_disabled && !pnpacpi_disabled) {
pnpbios_disabled = 1;

View File

@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex;
#ifdef CONFIG_PNPBIOS
extern struct pnp_protocol pnpbios_protocol;
extern bool arch_pnpbios_disabled(void);
#define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol))
#else
#define pnp_device_is_pnpbios(dev) 0
#define arch_pnpbios_disabled() false
#endif
#ifdef CONFIG_PNPACPI

View File

@ -3351,12 +3351,18 @@ int main(int argc, char *argv[])
/* Boot protocol version: 2.07 supports the fields for lguest. */
boot->hdr.version = 0x207;
/* The hardware_subarch value of "1" tells the Guest it's an lguest. */
boot->hdr.hardware_subarch = 1;
/* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
/* Tell the entry path not to try to reload segment registers. */
boot->hdr.loadflags |= KEEP_SEGMENTS;
/* We don't support tboot: */
boot->tboot_addr = 0;
/* Ensure this is 0 to prevent APM from loading: */
boot->apm_bios_info.version = 0;
/* We tell the kernel to initialize the Guest. */
tell_kernel(start);