linux/arch/x86/kernel/traps.c
Linus Torvalds 039aeb9deb ARM:
- Move the arch-specific code into arch/arm64/kvm
 - Start the post-32bit cleanup
 - Cherry-pick a few non-invasive pre-NV patches
 
 x86:
 - Rework of TLB flushing
 - Rework of event injection, especially with respect to nested virtualization
 - Nested AMD event injection facelift, building on the rework of generic code
 and fixing a lot of corner cases
 - Nested AMD live migration support
 - Optimization for TSC deadline MSR writes and IPIs
 - Various cleanups
 - Asynchronous page fault cleanups (from tglx, common topic branch with tip tree)
 - Interrupt-based delivery of asynchronous "page ready" events (host side)
 - Hyper-V MSRs and hypercalls for guest debugging
 - VMX preemption timer fixes
 
 s390:
 - Cleanups
 
 Generic:
 - switch vCPU thread wakeup from swait to rcuwait
 
 The other architectures, and the guest side of the asynchronous page fault
 work, will come next week.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAl7VJcYUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroPf6QgAq4wU5wdd1lTGz/i3DIhNVJNJgJlp
 ozLzRdMaJbdbn5RpAK6PEBd9+pt3+UlojpFB3gpJh2Nazv2OzV4yLQgXXXyyMEx1
 5Hg7b4UCJYDrbkCiegNRv7f/4FWDkQ9dx++RZITIbxeskBBCEI+I7GnmZhGWzuC4
 7kj4ytuKAySF2OEJu0VQF6u0CvrNYfYbQIRKBXjtOwuRK4Q6L63FGMJpYo159MBQ
 asg3B1jB5TcuGZ9zrjL5LkuzaP4qZZHIRs+4kZsH9I6MODHGUxKonrkablfKxyKy
 CFK+iaHCuEXXty5K0VmWM3nrTfvpEjVjbMc7e1QGBQ5oXsDM0pqn84syRg==
 =v7Wn
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
 "ARM:
   - Move the arch-specific code into arch/arm64/kvm

   - Start the post-32bit cleanup

   - Cherry-pick a few non-invasive pre-NV patches

  x86:
   - Rework of TLB flushing

   - Rework of event injection, especially with respect to nested
     virtualization

   - Nested AMD event injection facelift, building on the rework of
     generic code and fixing a lot of corner cases

   - Nested AMD live migration support

   - Optimization for TSC deadline MSR writes and IPIs

   - Various cleanups

   - Asynchronous page fault cleanups (from tglx, common topic branch
     with tip tree)

   - Interrupt-based delivery of asynchronous "page ready" events (host
     side)

   - Hyper-V MSRs and hypercalls for guest debugging

   - VMX preemption timer fixes

  s390:
   - Cleanups

  Generic:
   - switch vCPU thread wakeup from swait to rcuwait

  The other architectures, and the guest side of the asynchronous page
  fault work, will come next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (256 commits)
  KVM: selftests: fix rdtsc() for vmx_tsc_adjust_test
  KVM: check userspace_addr for all memslots
  KVM: selftests: update hyperv_cpuid with SynDBG tests
  x86/kvm/hyper-v: Add support for synthetic debugger via hypercalls
  x86/kvm/hyper-v: enable hypercalls regardless of hypercall page
  x86/kvm/hyper-v: Add support for synthetic debugger interface
  x86/hyper-v: Add synthetic debugger definitions
  KVM: selftests: VMX preemption timer migration test
  KVM: nVMX: Fix VMX preemption timer migration
  x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit
  KVM: x86/pmu: Support full width counting
  KVM: x86/pmu: Tweak kvm_pmu_get_msr to pass 'struct msr_data' in
  KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT
  KVM: x86: acknowledgment mechanism for async pf page ready notifications
  KVM: x86: interrupt based APF 'page ready' event delivery
  KVM: introduce kvm_read_guest_offset_cached()
  KVM: rename kvm_arch_can_inject_async_page_present() to kvm_arch_can_dequeue_async_page_present()
  KVM: x86: extend struct kvm_vcpu_pv_apf_data with token info
  Revert "KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously"
  KVM: VMX: Replace zero-length array with flexible-array
  ...
2020-06-03 15:13:47 -07:00

908 lines
26 KiB
C

/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
/*
* Handle hardware traps and faults.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/vm86.h>
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#include <asm/proto.h>
#endif
DECLARE_BITMAP(system_vectors, NR_VECTORS);
static inline void cond_local_irq_enable(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
static inline void cond_local_irq_disable(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_disable();
}
int is_valid_bugaddr(unsigned long addr)
{
unsigned short ud;
if (addr < TASK_SIZE_MAX)
return 0;
if (probe_kernel_address((unsigned short *)addr, ud))
return 0;
return ud == INSN_UD0 || ud == INSN_UD2;
}
int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
switch (report_bug(regs->ip, regs)) {
case BUG_TRAP_TYPE_NONE:
case BUG_TRAP_TYPE_BUG:
break;
case BUG_TRAP_TYPE_WARN:
regs->ip += LEN_UD2;
return 1;
}
return 0;
}
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
{
if (v8086_mode(regs)) {
/*
* Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
if (trapnr < X86_TRAP_UD) {
if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, trapnr))
return 0;
}
} else if (!user_mode(regs)) {
if (fixup_exception(regs, trapnr, error_code, 0))
return 0;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
die(str, regs, error_code);
}
/*
* We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
* kernelspace faults which are fixed up. die() gives the
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
* delivered, faults. See also do_general_protection below.
*/
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
return -1;
}
static void show_signal(struct task_struct *tsk, int signr,
const char *type, const char *desc,
struct pt_regs *regs, long error_code)
{
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
tsk->comm, task_pid_nr(tsk), type, desc,
regs->ip, regs->sp, error_code);
print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
}
static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, int sicode, void __user *addr)
{
struct task_struct *tsk = current;
if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
return;
show_signal(tsk, signr, "trap ", str, regs, error_code);
if (!sicode)
force_sig(signr);
else
force_sig_fault(signr, sicode, addr);
}
NOKPROBE_SYMBOL(do_trap);
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
unsigned long trapnr, int signr, int sicode, void __user *addr)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
/*
* WARN*()s end up here; fix them up before we call the
* notifier chain.
*/
if (!user_mode(regs) && fixup_bug(regs, trapnr))
return;
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
}
}
#define IP ((void __user *)uprobe_get_trap_addr(regs))
#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
}
DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error)
DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow)
DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
#undef IP
dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
{
char *str = "alignment check";
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
if (!user_mode(regs))
die("Split lock detected\n", regs, error_code);
local_irq_enable();
if (handle_user_split_lock(regs, error_code))
return;
do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
error_code, BUS_ADRALN, NULL);
}
#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(const char *message,
struct pt_regs *regs,
unsigned long fault_address)
{
printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
(void *)fault_address, current->stack,
(char *)current->stack + THREAD_SIZE - 1);
die(message, regs, 0);
/* Be absolutely certain we don't return. */
panic("%s", message);
}
#endif
/*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
* SDM's warnings about double faults being unrecoverable, returning works as
* expected. Presumably what the SDM actually means is that the CPU may get
* the register state wrong on entry, so returning could be a bad idea.
*
* Various CPU engineers have promised that double faults due to an IRET fault
* while the stack is read-only are, in fact, recoverable.
*
* On x86_32, this is entered through a task gate, and regs are synthesized
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
*/
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, take
* advantage of the fact that we're not using the normal (TSS.sp0)
* stack right now. We can write a fake #GP(0) frame at TSS.sp0
* and then modify our own IRET frame so that, when we return,
* we land directly at the #GP(0) vector with the stack already
* set up according to its expectations.
*
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
*
* No need for nmi_enter() here because we don't use RCU.
*/
if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/*
* regs->sp points to the failing IRET frame on the
* ESPFIX64 stack. Copy it to the entry stack. This fills
* in gpregs->ss through gpregs->ip.
*
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
* we won't enable interupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*
* We will enter general_protection with kernel GSBASE,
* which is what the stub expects, given that the faulting
* RIP will be the IRET instruction.
*/
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
#endif
nmi_enter();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF;
#ifdef CONFIG_VMAP_STACK
/*
* If we overflow the stack into a guard page, the CPU will fail
* to deliver #PF and will send #DF instead. Similarly, if we
* take any non-IST exception while too close to the bottom of
* the stack, the processor will get a page fault while
* delivering the exception and will generate a double fault.
*
* According to the SDM (footnote in 6.15 under "Interrupt 14 -
* Page-Fault Exception (#PF):
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
* delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
* double fault.
*
* The logic below has a small possibility of incorrectly diagnosing
* some errors as stack overflows. For example, if the IDT or GDT
* gets corrupted such that #GP delivery fails due to a bad descriptor
* causing #GP and we hit this condition while CR2 coincidentally
* points to the stack guard page, we'll think we overflowed the
* stack. Given that we're going to panic one way or another
* if this happens, this isn't necessarily worth fixing.
*
* If necessary, we could improve the test by only diagnosing
* a stack overflow if the saved RSP points within 47 bytes of
* the bottom of the stack: if RSP == tsk_stack + 48 and we
* take an exception, the stack is already aligned and there
* will be enough room SS, RSP, RFLAGS, CS, RIP, and a
* possible error code, so a stack overflow would *not* double
* fault. With any less space left, exception delivery could
* fail, and, as a practical matter, we've overflowed the
* stack even if the actual trigger for the double fault was
* something else.
*/
if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
panic("Machine halted.");
}
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
return;
cond_local_irq_enable(regs);
if (!user_mode(regs))
die("bounds", regs, error_code);
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
}
enum kernel_gp_hint {
GP_NO_HINT,
GP_NON_CANONICAL,
GP_CANONICAL
};
/*
* When an uncaught #GP occurs, try to determine the memory address accessed by
* the instruction and return that address to the caller. Also, try to figure
* out whether any part of the access to that address was non-canonical.
*/
static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
unsigned long *addr)
{
u8 insn_buf[MAX_INSN_SIZE];
struct insn insn;
if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
return GP_NO_HINT;
kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
insn_get_modrm(&insn);
insn_get_sib(&insn);
*addr = (unsigned long)insn_get_addr_ref(&insn, regs);
if (*addr == -1UL)
return GP_NO_HINT;
#ifdef CONFIG_X86_64
/*
* Check that:
* - the operand is not in the kernel half
* - the last byte of the operand is not in the user canonical half
*/
if (*addr < ~__VIRTUAL_MASK &&
*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
return GP_NON_CANONICAL;
#endif
return GP_CANONICAL;
}
#define GPFSTR "general protection fault"
dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
struct task_struct *tsk;
unsigned long gp_addr;
int ret;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
return;
}
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
return;
}
tsk = current;
if (user_mode(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
return;
}
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
/*
* To be potentially processing a kprobe fault and to trust the result
* from kprobe_running(), we have to be non-preemptible.
*/
if (!preemptible() &&
kprobe_running() &&
kprobe_fault_handler(regs, X86_TRAP_GP))
return;
ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
if (ret == NOTIFY_STOP)
return;
if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
else
hint = get_kernel_gp_address(regs, &gp_addr);
if (hint != GP_NO_HINT)
snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
(hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
: "maybe for address",
gp_addr);
/*
* KASAN is interested only in the non-canonical case, clear it
* otherwise.
*/
if (hint != GP_NON_CANONICAL)
gp_addr = 0;
die_addr(desc, regs, error_code, gp_addr);
}
NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
if (poke_int3_handler(regs))
return;
/*
* Unlike any other non-IST entry, we can be called from pretty much
* any location in the kernel through kprobes -- text_poke() will most
* likely be handled by poke_int3_handler() above. This means this
* handler is effectively NMI-like.
*/
if (!user_mode(regs))
nmi_enter();
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
goto exit;
#endif
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
cond_local_irq_enable(regs);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
cond_local_irq_disable(regs);
exit:
if (!user_mode(regs))
nmi_exit();
}
NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
* Help handler running on a per-cpu (IST or entry trampoline) stack
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
struct bad_iret_stack {
void *error_entry_ret;
struct pt_regs regs;
};
asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
* correctly, we want to move our stack frame to where it would
* be had we entered directly on the entry stack (rather than
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
static bool is_sysenter_singlestep(struct pt_regs *regs)
{
/*
* We don't try for precision here. If we're anywhere in the region of
* code that can be single-stepped in the SYSENTER entry path, then
* assume that this is a useless single-step trap due to SYSENTER
* being invoked with TF set. (We don't know in advance exactly
* which instructions will be hit because BTF could plausibly
* be set.)
*/
#ifdef CONFIG_X86_32
return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
(unsigned long)__end_SYSENTER_singlestep_region -
(unsigned long)__begin_SYSENTER_singlestep_region;
#elif defined(CONFIG_IA32_EMULATION)
return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
(unsigned long)__end_entry_SYSENTER_compat -
(unsigned long)entry_SYSENTER_compat;
#else
return false;
#endif
}
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
* it is possible to get a watchpoint trap here from inside the kernel.
* However, the code in ./ptrace.c has ensured that the user can
* only set watchpoints on userspace addresses. Therefore the in-kernel
* watchpoint trap can only occur in code which is reading/writing
* from user space. Such code must not hold kernel locks (since it
* can equally take a page fault), therefore it is safe to call
* force_sig_info even though that claims and releases locks.
*
* Code in ./signal.c ensures that the debug control register
* is restored before we deliver any signal, and therefore that
* user code runs with the correct debug control register even though
* we clear it here.
*
* Being careful here means that we don't have to be as careful in a
* lot of more complicated places (task switching can be a bit lazy
* about restoring all the debug state, and ptrace doesn't have to
* find every occurrence of the TF bit that could be saved away even
* by user code)
*
* May run on IST stack.
*/
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
int user_icebp = 0;
unsigned long dr6;
int si_code;
nmi_enter();
get_debugreg(dr6, 6);
/*
* The Intel SDM says:
*
* Certain debug exceptions may clear bits 0-3. The remaining
* contents of the DR6 register are never cleared by the
* processor. To avoid confusion in identifying debug
* exceptions, debug handlers should clear the register before
* returning to the interrupted task.
*
* Keep it simple: clear DR6 immediately.
*/
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
dr6 &= ~DR6_RESERVED;
/*
* The SDM says "The processor clears the BTF flag when it
* generates a debug exception." Clear TIF_BLOCKSTEP to keep
* TIF_BLOCKSTEP in sync with the hardware BTF flag.
*/
clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
is_sysenter_singlestep(regs))) {
dr6 &= ~DR_STEP;
if (!dr6)
goto exit;
/*
* else we might have gotten a single-step trap and hit a
* watchpoint at the same time, in which case we should fall
* through and handle the watchpoint.
*/
}
/*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
if (!dr6 && user_mode(regs))
user_icebp = 1;
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
#ifdef CONFIG_KPROBES
if (kprobe_debug_handler(regs))
goto exit;
#endif
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
/*
* Let others (NMI) know that the debug stack is in use
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
/* It's safe to allow irq's after DR6 has been saved */
cond_local_irq_enable(regs);
if (v8086_mode(regs)) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
cond_local_irq_disable(regs);
debug_stack_usage_dec();
goto exit;
}
if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
/*
* Historical junk that used to handle SYSENTER single-stepping.
* This should be unreachable now. If we survive for a while
* without anyone hitting this warning, we'll turn this into
* an oops.
*/
tsk->thread.debugreg6 &= ~DR_STEP;
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
}
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(regs, error_code, si_code);
cond_local_irq_disable(regs);
debug_stack_usage_dec();
exit:
nmi_exit();
}
NOKPROBE_SYMBOL(do_debug);
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
struct fpu *fpu = &task->thread.fpu;
int si_code;
char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
"simd exception";
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
if (fixup_exception(regs, trapnr, error_code, 0))
return;
task->thread.error_code = error_code;
task->thread.trap_nr = trapnr;
if (notify_die(DIE_TRAP, str, regs, error_code,
trapnr, SIGFPE) != NOTIFY_STOP)
die(str, regs, error_code);
return;
}
/*
* Save the info for the exception handler and clear the error.
*/
fpu__save(fpu);
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
si_code = fpu__exception_code(fpu, trapnr);
/* Retry when we get spurious exceptions: */
if (!si_code)
return;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
}
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_MF);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_XF);
}
dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
/*
* This addresses a Pentium Pro Erratum:
*
* PROBLEM: If the APIC subsystem is configured in mixed mode with
* Virtual Wire mode implemented through the local APIC, an
* interrupt vector of 0Fh (Intel reserved encoding) may be
* generated by the local APIC (Int 15). This vector may be
* generated upon receipt of a spurious interrupt (an interrupt
* which is removed before the system receives the INTA sequence)
* instead of the programmed 8259 spurious interrupt vector.
*
* IMPLICATION: The spurious interrupt vector programmed in the
* 8259 is normally handled by an operating system's spurious
* interrupt handler. However, a vector of 0Fh is unknown to some
* operating systems, which would crash if this erratum occurred.
*
* In theory this could be limited to 32bit, but the handler is not
* hurting and who knows which other CPUs suffer from this.
*/
}
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
unsigned long cr0 = read_cr0();
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
cond_local_irq_enable(regs);
info.regs = regs;
math_emulate(&info);
return;
}
#endif
/* This should not happen. */
if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
/* Try to fix it up and carry on. */
write_cr0(cr0 & ~X86_CR0_TS);
} else {
/*
* Something terrible happened, and we're better off trying
* to kill the task than getting stuck in a never-ending
* loop of #NM faults.
*/
die("unexpected #NM exception", regs, error_code);
}
}
NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
ILL_BADSTK, (void __user *)NULL);
}
}
#endif
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
idt_setup_traps();
/*
* Set the IDT descriptor to a fixed read-only location, so that the
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
PAGE_KERNEL_RO);
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/*
* Should be a barrier for any external CPU state:
*/
cpu_init();
idt_setup_ist_traps();
idt_setup_debugidt_traps();
}