Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm changes from Ingo Molnar:
 "The main changes in this cycle were the x86/entry and sysret
  enhancements from Andy Lutomirski, see merge commits 772a9aca12 and
  b57c0b5175 for details"

[ Exectutive summary: IST exceptions that interrupt user space will run
  on the regular kernel stack instead of the IST stack.  Which
  simplifies things particularly on return to user space.

  The sysret cleanup ends up simplifying the logic on when we can use
  sysret vs when we have to use iret.                - Linus ]

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86_64, entry: Remove the syscall exit audit and schedule optimizations
  x86_64, entry: Use sysret to return to userspace when possible
  x86, traps: Fix ist_enter from userspace
  x86, vdso: teach 'make clean' remove vdso64 binaries
  x86_64 entry: Fix RCX for ptraced syscalls
  x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
  x86: ia32entry.S: fix wrong symbolic constant usage: R11->ARGOFFSET
  x86: entry_64.S: delete unused code
  x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks
  x86, traps: Add ist_begin_non_atomic and ist_end_non_atomic
  x86: Clean up current_stack_pointer
  x86, traps: Track entry into and exit from IST context
  x86, entry: Switch stacks on a paranoid entry from userspace
This commit is contained in:
Linus Torvalds 2015-02-09 17:16:44 -08:00
commit 7453311d68
15 changed files with 325 additions and 310 deletions

View File

@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
xorl %ebx,%ebx xorl %ebx,%ebx
1: ret 1: ret
and the whole paranoid non-paranoid macro complexity is about whether
to suffer that RDMSR cost.
If we are at an interrupt or user-trap/gate-alike boundary then we can If we are at an interrupt or user-trap/gate-alike boundary then we can
use the faster check: the stack will be a reliable indicator of use the faster check: the stack will be a reliable indicator of
whether SWAPGS was already done: if we see that we are a secondary whether SWAPGS was already done: if we see that we are a secondary
@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
stack but before we executed SWAPGS, then the only safe way to check stack but before we executed SWAPGS, then the only safe way to check
for GS is the slower method: the RDMSR. for GS is the slower method: the RDMSR.
So we try only to mark those entry methods 'paranoid' that absolutely Therefore, super-atomic entries (except NMI, which is handled separately)
need the more expensive check for the GS base - and we generate all must use idtentry with paranoid=1 to handle gsbase correctly. This
'normal' entry points with the regular (faster) entry macros. triggers three main behavior changes:
- Interrupt entry will use the slower gsbase check.
- Interrupt entry from user mode will switch off the IST stack.
- Interrupt exit to kernel mode will not attempt to reschedule.
We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.

View File

@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
interrupt-gate descriptor. When an interrupt occurs and the hardware interrupt-gate descriptor. When an interrupt occurs and the hardware
loads such a descriptor, the hardware automatically sets the new stack loads such a descriptor, the hardware automatically sets the new stack
pointer based on the IST value, then invokes the interrupt handler. If pointer based on the IST value, then invokes the interrupt handler. If
software wants to allow nested IST interrupts then the handler must the interrupt came from user mode, then the interrupt handler prologue
adjust the IST values on entry to and exit from the interrupt handler. will switch back to the per-thread stack. If software wants to allow
(This is occasionally done, e.g. for debug exceptions.) nested IST interrupts then the handler must adjust the IST values on
entry to and exit from the interrupt handler. (This is occasionally
done, e.g. for debug exceptions.)
Events with different IST codes (i.e. with different stacks) can be Events with different IST codes (i.e. with different stacks) can be
nested. For example, a debug interrupt can safely be interrupted by an nested. For example, a debug interrupt can safely be interrupted by an

View File

@ -179,8 +179,8 @@ sysenter_dispatch:
sysexit_from_sys_call: sysexit_from_sys_call:
andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
/* clear IF, that popfq doesn't enable interrupts early */ /* clear IF, that popfq doesn't enable interrupts early */
andl $~0x200,EFLAGS-R11(%rsp) andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */ movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx CFI_REGISTER rip,rdx
RESTORE_ARGS 0,24,0,0,0,0 RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8 xorq %r8,%r8

View File

@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SS 160 #define SS 160
#define ARGOFFSET R11 #define ARGOFFSET R11
#define SWFRAME ORIG_RAX
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp subq $9*8+\addskip, %rsp

View File

@ -190,7 +190,6 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void); int mce_notify_irq(void);
void mce_notify_process(void);
DECLARE_PER_CPU(struct mce, injectm); DECLARE_PER_CPU(struct mce, injectm);

View File

@ -75,7 +75,6 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */ #define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */
@ -100,7 +99,6 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTSC (1 << TIF_NOTSC)
@ -140,7 +138,7 @@ struct thread_info {
/* Only used for 64 bit */ /* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \ #define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
_TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */ /* flags to check in __switch_to() */
@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void)
return ti; return ti;
} }
static inline unsigned long current_stack_pointer(void)
{
unsigned long sp;
#ifdef CONFIG_X86_64
asm("mov %%rsp,%0" : "=g" (sp));
#else
asm("mov %%esp,%0" : "=g" (sp));
#endif
return sp;
}
#else /* !__ASSEMBLY__ */ #else /* !__ASSEMBLY__ */
/* how to get the thread information struct from ASM */ /* how to get the thread information struct from ASM */

View File

@ -1,6 +1,7 @@
#ifndef _ASM_X86_TRAPS_H #ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H
#include <linux/context_tracking_state.h>
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <asm/debugreg.h> #include <asm/debugreg.h>
@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void); asmlinkage void mce_threshold_interrupt(void);
#endif #endif
extern enum ctx_state ist_enter(struct pt_regs *regs);
extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);
/* Interrupts/Exceptions */ /* Interrupts/Exceptions */
enum { enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */ X86_TRAP_DE = 0, /* 0, Divide-by-zero */

View File

@ -43,6 +43,7 @@
#include <linux/export.h> #include <linux/export.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/msr.h> #include <asm/msr.h>
@ -1002,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
} }
} }
/*
* Need to save faulting physical address associated with a process
* in the machine check handler some place where we can grab it back
* later in mce_notify_process()
*/
#define MCE_INFO_MAX 16
struct mce_info {
atomic_t inuse;
struct task_struct *t;
__u64 paddr;
int restartable;
} mce_info[MCE_INFO_MAX];
static void mce_save_info(__u64 addr, int c)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
mi->t = current;
mi->paddr = addr;
mi->restartable = c;
return;
}
}
mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}
static struct mce_info *mce_find_info(void)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
if (atomic_read(&mi->inuse) && mi->t == current)
return mi;
return NULL;
}
static void mce_clear_info(struct mce_info *mi)
{
atomic_set(&mi->inuse, 0);
}
/* /*
* The actual machine check handler. This only handles real * The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18. * exceptions when something got corrupted coming in through int 18.
@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{ {
struct mca_config *cfg = &mca_cfg; struct mca_config *cfg = &mca_cfg;
struct mce m, *final; struct mce m, *final;
enum ctx_state prev_state;
int i; int i;
int worst = 0; int worst = 0;
int severity; int severity;
@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown"; char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
prev_state = ist_enter(regs);
this_cpu_inc(mce_exception_count); this_cpu_inc(mce_exception_count);
@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (no_way_out) if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg); mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) { if (worst == MCE_AR_SEVERITY) {
/* schedule action before return to userland */ recover_paddr = m.addr;
mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); if (!(m.mcgstatus & MCG_STATUS_RIPV))
set_thread_flag(TIF_MCE_NOTIFY); flags |= MF_MUST_KILL;
} else if (kill_it) { } else if (kill_it) {
force_sig(SIGBUS, current); force_sig(SIGBUS, current);
} }
@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out: out:
sync_core(); sync_core();
if (recover_paddr == ~0ull)
goto done;
pr_err("Uncorrected hardware memory error in user-access at %llx",
recover_paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
ist_begin_non_atomic(regs);
local_irq_enable();
if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
local_irq_disable();
ist_end_non_atomic();
done:
ist_exit(regs, prev_state);
} }
EXPORT_SYMBOL_GPL(do_machine_check); EXPORT_SYMBOL_GPL(do_machine_check);
@ -1232,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
} }
#endif #endif
/*
* Called in process context that interrupted by MCE and marked with
* TIF_MCE_NOTIFY, just before returning to erroneous userland.
* This code is allowed to sleep.
* Attempt possible recovery such as calling the high level VM handler to
* process any corrupted pages, and kill/signal current process if required.
* Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
struct mce_info *mi = mce_find_info();
int flags = MF_ACTION_REQUIRED;
if (!mi)
mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
pfn = mi->paddr >> PAGE_SHIFT;
clear_thread_flag(TIF_MCE_NOTIFY);
pr_err("Uncorrected hardware memory error in user-access at %llx",
mi->paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
if (!mi->restartable)
flags |= MF_MUST_KILL;
if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
mce_clear_info(mi);
}
/* /*
* Action optional processing happens here (picking up * Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check() * from the list of faulting pages that do_machine_check()

View File

@ -8,6 +8,7 @@
#include <linux/smp.h> #include <linux/smp.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/msr.h> #include <asm/msr.h>
@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */ /* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code) static void pentium_machine_check(struct pt_regs *regs, long error_code)
{ {
enum ctx_state prev_state;
u32 loaddr, hi, lotype; u32 loaddr, hi, lotype;
prev_state = ist_enter(regs);
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
} }
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
} }
/* Set up machine check reporting for processors with Intel style MCE: */ /* Set up machine check reporting for processors with Intel style MCE: */

View File

@ -7,14 +7,19 @@
#include <linux/types.h> #include <linux/types.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/msr.h> #include <asm/msr.h>
/* Machine check handler for WinChip C6: */ /* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code) static void winchip_machine_check(struct pt_regs *regs, long error_code)
{ {
enum ctx_state prev_state = ist_enter(regs);
printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
} }
/* Set up machine check reporting on the Winchip C6 series */ /* Set up machine check reporting on the Winchip C6 series */

View File

@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,RSP+\offset(%rsp) movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp) movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp) movq $__USER_CS,CS+\offset(%rsp)
movq $-1,RCX+\offset(%rsp) movq RIP+\offset(%rsp),\tmp /* get rip */
movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
movq R11+\offset(%rsp),\tmp /* get eflags */ movq R11+\offset(%rsp),\tmp /* get eflags */
movq \tmp,EFLAGS+\offset(%rsp) movq \tmp,EFLAGS+\offset(%rsp)
.endm .endm
@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,R11+\offset(%rsp) movq \tmp,R11+\offset(%rsp)
.endm .endm
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
pushq_cfi $__KERNEL_DS /* ss */
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
pushq_cfi \child_rip /* rip */
CFI_REL_OFFSET rip,0
pushq_cfi %rax /* orig rax */
.endm
.macro UNFAKE_STACK_FRAME
addq $8*6, %rsp
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
/* /*
* initial frame state for interrupts (and exceptions without error code) * initial frame state for interrupts (and exceptions without error code)
*/ */
@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset CFI_REL_OFFSET r15, R15+\offset
.endm .endm
/* save partial stack frame */
.macro SAVE_ARGS_IRQ
cld
/* start from rbp in pt_regs and jump over */
movq_cfi rdi, (RDI-RBP)
movq_cfi rsi, (RSI-RBP)
movq_cfi rdx, (RDX-RBP)
movq_cfi rcx, (RCX-RBP)
movq_cfi rax, (RAX-RBP)
movq_cfi r8, (R8-RBP)
movq_cfi r9, (R9-RBP)
movq_cfi r10, (R10-RBP)
movq_cfi r11, (R11-RBP)
/* Save rbp so that we can unwind from get_irq_regs() */
movq_cfi rbp, 0
/* Save previous stack value */
movq %rsp, %rsi
leaq -RBP(%rsp),%rdi /* arg1 for handler */
testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
/* Store previous stack value */
pushq %rsi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
0x06 /* DW_OP_deref */, \
0x08 /* DW_OP_const1u */, SS+8-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
.endm
ENTRY(save_paranoid) ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8 XCPT_FRAME 1 RDI+8
cld cld
@ -426,15 +361,12 @@ system_call_fastpath:
* Has incomplete stack frame and undefined top of stack. * Has incomplete stack frame and undefined top of stack.
*/ */
ret_from_sys_call: ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
/* edi: flagmask */ jnz int_ret_from_sys_call_fixup /* Go the the slow path */
sysret_check:
LOCKDEP_SYS_EXIT LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE) DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF TRACE_IRQS_OFF
movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE CFI_REMEMBER_STATE
/* /*
* sysretq will re-enable interrupts: * sysretq will re-enable interrupts:
@ -448,49 +380,10 @@ sysret_check:
USERGS_SYSRET64 USERGS_SYSRET64
CFI_RESTORE_STATE CFI_RESTORE_STATE
/* Handle reschedules */
/* edx: work, edi: workmask */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
SCHEDULE_USER
popq_cfi %rdi
jmp sysret_check
/* Handle a signal */ int_ret_from_sys_call_fixup:
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
bt $TIF_SYSCALL_AUDIT,%edx
jc sysret_audit
#endif
/*
* We have a signal, or exit tracing or single-step.
* These all wind up with the iret return path anyway,
* so just join that path right now.
*/
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
jmp int_check_syscall_exit_work jmp int_ret_from_sys_call
#ifdef CONFIG_AUDITSYSCALL
/*
* Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
*/
sysret_audit:
movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
call __audit_syscall_exit
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */ /* Do syscall tracing */
tracesys: tracesys:
@ -626,19 +519,6 @@ END(\label)
FORK_LIKE vfork FORK_LIKE vfork
FIXED_FRAME stub_iopl, sys_iopl FIXED_FRAME stub_iopl, sys_iopl
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
RESTORE_TOP_OF_STACK %r11, 8
movq_cfi_restore R15+8, r15
movq_cfi_restore R14+8, r14
movq_cfi_restore R13+8, r13
movq_cfi_restore R12+8, r12
movq_cfi_restore RBP+8, rbp
movq_cfi_restore RBX+8, rbx
ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(ptregscall_common)
ENTRY(stub_execve) ENTRY(stub_execve)
CFI_STARTPROC CFI_STARTPROC
addq $8, %rsp addq $8, %rsp
@ -779,7 +659,48 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */ /* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
SAVE_ARGS_IRQ cld
/* start from rbp in pt_regs and jump over */
movq_cfi rdi, (RDI-RBP)
movq_cfi rsi, (RSI-RBP)
movq_cfi rdx, (RDX-RBP)
movq_cfi rcx, (RCX-RBP)
movq_cfi rax, (RAX-RBP)
movq_cfi r8, (R8-RBP)
movq_cfi r9, (R9-RBP)
movq_cfi r10, (R10-RBP)
movq_cfi r11, (R11-RBP)
/* Save rbp so that we can unwind from get_irq_regs() */
movq_cfi rbp, 0
/* Save previous stack value */
movq %rsp, %rsi
leaq -RBP(%rsp),%rdi /* arg1 for handler */
testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
/* Store previous stack value */
pushq %rsi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
0x06 /* DW_OP_deref */, \
0x08 /* DW_OP_const1u */, SS+8-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
call \func call \func
.endm .endm
@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
*/ */
DISABLE_INTERRUPTS(CLBR_ANY) DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ TRACE_IRQS_IRETQ
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context.
*/
movq (RCX-R11)(%rsp), %rcx
cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
jne opportunistic_sysret_failed
/*
* On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP. It's not worth
* testing for canonicalness exactly -- this check detects any
* of the 17 high bits set, which is true for non-canonical
* or kernel addresses. (This will pessimize vsyscall=native.
* Big deal.)
*
* If virtual addresses ever become wider, this will need
* to be updated to remain correct on both old and new CPUs.
*/
.ifne __VIRTUAL_MASK_SHIFT - 47
.error "virtual address width changed -- sysret checks need update"
.endif
shr $__VIRTUAL_MASK_SHIFT, %rcx
jnz opportunistic_sysret_failed
cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
jne opportunistic_sysret_failed
movq (R11-ARGOFFSET)(%rsp), %r11
cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
jne opportunistic_sysret_failed
testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
jnz opportunistic_sysret_failed
/* nothing to check for RSP */
cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
jne opportunistic_sysret_failed
/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
irq_return_via_sysret:
CFI_REMEMBER_STATE
RESTORE_ARGS 1,8,1
movq (RSP-RIP)(%rsp),%rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
opportunistic_sysret_failed:
SWAPGS SWAPGS
jmp restore_args jmp restore_args
@ -1048,6 +1023,11 @@ ENTRY(\sym)
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
.if \paranoid .if \paranoid
.if \paranoid == 1
CFI_REMEMBER_STATE
testl $3, CS(%rsp) /* If coming from userspace, switch */
jnz 1f /* stacks. */
.endif
call save_paranoid call save_paranoid
.else .else
call error_entry call error_entry
@ -1088,6 +1068,36 @@ ENTRY(\sym)
jmp error_exit /* %ebx: no swapgs flag */ jmp error_exit /* %ebx: no swapgs flag */
.endif .endif
.if \paranoid == 1
CFI_RESTORE_STATE
/*
* Paranoid entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
1:
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
call sync_regs
movq %rax,%rsp /* switch stack */
movq %rsp,%rdi /* pt_regs pointer */
.if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
.else
xorl %esi,%esi /* no error code */
.endif
call \do_sym
jmp error_exit /* %ebx: no swapgs flag */
.endif
CFI_ENDPROC CFI_ENDPROC
END(\sym) END(\sym)
.endm .endm
@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0 idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0
idtentry double_fault do_double_fault has_error_code=1 paranoid=1 idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1
@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
#endif #endif
/* /*
* "Paranoid" exit path from exception stack. * "Paranoid" exit path from exception stack. This is invoked
* Paranoid because this is used by NMIs and cannot take * only on return from non-NMI IST interrupts that came
* any kernel state for granted. * from kernel space.
* We don't do kernel preemption checks here, because only
* NMI should be common and it does not enable IRQs and
* cannot get reschedule ticks.
* *
* "trace" is 0 for the NMI handler only, because irq-tracing * We may be returning to very strange contexts (e.g. very early
* is fundamentally NMI-unsafe. (we cannot change the soft and * in syscall entry), so checking for preemption here would
* hard flags at once, atomically) * be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
*/ */
/* ebx: no swapgs flag */ /* ebx: no swapgs flag */
@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */ testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore jnz paranoid_restore
testl $3,CS(%rsp)
jnz paranoid_userspace
paranoid_swapgs:
TRACE_IRQS_IRETQ 0 TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK SWAPGS_UNSAFE_STACK
RESTORE_ALL 8 RESTORE_ALL 8
jmp irq_return INTERRUPT_RETURN
paranoid_restore: paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0 TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8 RESTORE_ALL 8
jmp irq_return INTERRUPT_RETURN
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
jnz paranoid_schedule
movl %ebx,%edx /* arg3: thread flags */
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp paranoid_userspace
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
SCHEDULE_USER
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
CFI_ENDPROC CFI_ENDPROC
END(paranoid_exit) END(paranoid_exit)

View File

@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax"); : "memory", "cc", "edx", "ecx", "eax");
} }
/* how to get the current stack pointer from C */
#define current_stack_pointer ({ \
unsigned long sp; \
asm("mov %%esp,%0" : "=g" (sp)); \
sp; \
})
static inline void *current_stack(void) static inline void *current_stack(void)
{ {
return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
} }
static inline int static inline int
@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
/* Save the next esp at the bottom of the stack */ /* Save the next esp at the bottom of the stack */
prev_esp = (u32 *)irqstk; prev_esp = (u32 *)irqstk;
*prev_esp = current_stack_pointer; *prev_esp = current_stack_pointer();
if (unlikely(overflow)) if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp); call_on_stack(print_stack_overflow, isp);
@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
/* Push the previous esp onto the stack */ /* Push the previous esp onto the stack */
prev_esp = (u32 *)irqstk; prev_esp = (u32 *)irqstk;
*prev_esp = current_stack_pointer; *prev_esp = current_stack_pointer();
call_on_stack(__do_softirq, isp); call_on_stack(__do_softirq, isp);
} }

View File

@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{ {
user_exit(); user_exit();
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
if (thread_info_flags & _TIF_UPROBE) if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs); uprobe_notify_resume(regs);

View File

@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec(); preempt_count_dec();
} }
enum ctx_state ist_enter(struct pt_regs *regs)
{
enum ctx_state prev_state;
if (user_mode_vm(regs)) {
/* Other than that, we're just an exception. */
prev_state = exception_enter();
} else {
/*
* We might have interrupted pretty much anything. In
* fact, if we're a machine check, we can even interrupt
* NMI processing. We don't want in_nmi() to return true,
* but we need to notify RCU.
*/
rcu_nmi_enter();
prev_state = IN_KERNEL; /* the value is irrelevant. */
}
/*
* We are atomic because we're on the IST stack (or we're on x86_32,
* in which case we still shouldn't schedule).
*
* This must be after exception_enter(), because exception_enter()
* won't do anything if in_interrupt() returns true.
*/
preempt_count_add(HARDIRQ_OFFSET);
/* This code is a bit fragile. Test it. */
rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
return prev_state;
}
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
{
/* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
if (user_mode_vm(regs))
return exception_exit(prev_state);
else
rcu_nmi_exit();
}
/**
* ist_begin_non_atomic() - begin a non-atomic section in an IST exception
* @regs: regs passed to the IST exception handler
*
* IST exception handlers normally cannot schedule. As a special
* exception, if the exception interrupted userspace code (i.e.
* user_mode_vm(regs) would return true) and the exception was not
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
* the non-atomic section, and callers must call is_end_non_atomic()
* before ist_exit().
*/
void ist_begin_non_atomic(struct pt_regs *regs)
{
BUG_ON(!user_mode_vm(regs));
/*
* Sanity check: we need to be on the normal thread stack. This
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
& ~(THREAD_SIZE - 1)) != 0);
preempt_count_sub(HARDIRQ_OFFSET);
}
/**
* ist_end_non_atomic() - begin a non-atomic section in an IST exception
*
* Ends a non-atomic section started with ist_begin_non_atomic().
*/
void ist_end_non_atomic(void)
{
preempt_count_add(HARDIRQ_OFFSET);
}
static nokprobe_inline int static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code) struct pt_regs *regs, long error_code)
@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* end up promoting it to a doublefault. In that case, modify * end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP * the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret. * handler from user space, similar to bad_iret.
*
* No need for ist_enter here because we don't use RCU.
*/ */
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS && regs->cs == __KERNEL_CS &&
@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
regs->ip = (unsigned long)general_protection; regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax; regs->sp = (unsigned long)&normal_regs->orig_ax;
return; return;
} }
#endif #endif
exception_enter(); ist_enter(regs); /* Discard prev_state because we won't return. */
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code; tsk->thread.error_code = error_code;
@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs)) if (poke_int3_handler(regs))
return; return;
prev_state = exception_enter(); prev_state = ist_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP) SIGTRAP) == NOTIFY_STOP)
@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs); preempt_conditional_cli(regs);
debug_stack_usage_dec(); debug_stack_usage_dec();
exit: exit:
exception_exit(prev_state); ist_exit(regs, prev_state);
} }
NOKPROBE_SYMBOL(do_int3); NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* /*
* Help handler running on IST stack to switch back to user stack * Help handler running on IST stack to switch off the IST stack if the
* for scheduling or signal handling. The actual stack switch is done in * interrupted code was in user mode. The actual stack switch is done in
* entry.S * entry_64.S
*/ */
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{ {
struct pt_regs *regs = eregs; struct pt_regs *regs = task_pt_regs(current);
/* Did already sync */ *regs = *eregs;
if (eregs == (struct pt_regs *)eregs->sp)
;
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
/*
* Exception from kernel and interrupts are enabled. Move to
* kernel process stack.
*/
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
*regs = *eregs;
return regs; return regs;
} }
NOKPROBE_SYMBOL(sync_regs); NOKPROBE_SYMBOL(sync_regs);
@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6; unsigned long dr6;
int si_code; int si_code;
prev_state = exception_enter(); prev_state = ist_enter(regs);
get_debugreg(dr6, 6); get_debugreg(dr6, 6);
@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec(); debug_stack_usage_dec();
exit: exit:
exception_exit(prev_state); ist_exit(regs, prev_state);
} }
NOKPROBE_SYMBOL(do_debug); NOKPROBE_SYMBOL(do_debug);

View File

@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets) PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*