x86/entry/32: Re-implement SYSENTER using the new C path

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/5b99659e8be70f3dd10cd8970a5c90293d9ad9a7.1444091585.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Andy Lutomirski 2015-10-05 17:48:15 -07:00 committed by Ingo Molnar
parent 150ac78d63
commit 5f310f739b
3 changed files with 51 additions and 100 deletions

View File

@ -363,7 +363,7 @@ __visible void do_int80_syscall_32(struct pt_regs *regs)
syscall_return_slowpath(regs);
}
/* Returns 0 to return using IRET or 1 to return using SYSRETL. */
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
/*
@ -417,7 +417,20 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
return 0;
/*
* Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
*
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
* because the ECX fixup above will ensure that this is essentially
* never the case.
*
* We don't allow syscalls at all from VM86 mode, but we still
* need to check VM, because we might be returning from sys_vm86.
*/
return static_cpu_has(X86_FEATURE_SEP) &&
regs->cs == __USER_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif

View File

@ -287,76 +287,47 @@ need_resched:
END(resume_kernel)
#endif
/*
* SYSENTER_RETURN points to after the SYSENTER instruction
* in the vsyscall page. See vsyscall-sysentry.S, which defines
* the symbol.
*/
# SYSENTER call handler stub
ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
pushl %ecx /* pt_regs->cx */
pushfl /* pt_regs->flags (except IF = 0) */
orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
pushl $0 /* pt_regs->ip = 0 (placeholder) */
pushl %eax /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
* Interrupts are disabled here, but we can't trace that until
* enough kernel state has been set up to call TRACE_IRQS_OFF - but
* we immediately enable interrupts at that point anyway.
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
pushl $__USER_DS
pushl %ebp
pushfl
orl $X86_EFLAGS_IF, (%esp)
pushl $__USER_CS
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary: TI_sysenter_return
* is relative to thread_info, which is at the bottom of the
* kernel stack page. 4*4 means the 4 words pushed above;
* TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
* and THREAD_SIZE takes us to the bottom.
*/
pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
pushl %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
/*
* Load the potential sixth argument from user stack.
* Careful about security.
*/
cmpl $__PAGE_OFFSET-3, %ebp
jae syscall_fault
ASM_STAC
1: movl (%ebp), %ebp
ASM_CLAC
movl %ebp, PT_EBP(%esp)
_ASM_EXTABLE(1b, syscall_fault)
GET_THREAD_INFO(%ebp)
testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
jnz syscall_trace_entry
sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
call *sys_call_table(, %eax, 4)
sysenter_after_call:
movl %eax, PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
jnz syscall_exit_work_irqs_off
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
xorl %ebp, %ebp
TRACE_IRQS_ON
movl %esp, %eax
call do_fast_syscall_32
testl %eax, %eax
jz .Lsyscall_32_done
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
movl PT_EIP(%esp), %edx /* pt_regs->ip */
movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
popl %ebx /* pt_regs->bx */
addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
popl %esi /* pt_regs->si */
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
popl %eax /* pt_regs->ax */
1: mov PT_FS(%esp), %fs
PTGS_TO_GS
/*
* Return back to the vDSO, which will pop ecx and edx.
* Don't bother with DS and ES (they already contain __USER_DS).
*/
ENABLE_INTERRUPTS_SYSEXIT
.pushsection .fixup, "ax"
@ -371,7 +342,7 @@ ENDPROC(entry_SYSENTER_32)
ENTRY(entry_INT80_32)
ASM_CLAC
pushl %eax /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, load -ENOSYS into ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
* User mode is traced as though IRQs are on, and the interrupt gate
@ -381,6 +352,7 @@ ENTRY(entry_INT80_32)
movl %esp, %eax
call do_int80_syscall_32
.Lsyscall_32_done:
restore_all:
TRACE_IRQS_IRET
@ -457,42 +429,6 @@ ldt_ss:
#endif
ENDPROC(entry_INT80_32)
# perform syscall entry tracing
ALIGN
/*
 * syscall_trace_entry - route a syscall through syscall_trace_enter.
 *
 * Jumped to from the syscall entry code when a _TIF_WORK_SYSCALL_ENTRY
 * bit is set in TI_flags.  %esp must point at the saved register frame,
 * since the PT_* offsets used here are relative to %esp.
 */
syscall_trace_entry:
/* Preload the saved-EAX slot with -ENOSYS before calling the tracer. */
movl $-ENOSYS, PT_EAX(%esp)
/* %eax = address of the saved register frame (presumably the pt_regs argument). */
movl %esp, %eax
call syscall_trace_enter
/* What it returned is what we'll actually use. */
/*
 * jnae is an unsigned "below" test: a returned value below NR_syscalls
 * continues at syscall_call, anything else goes straight to syscall_exit.
 */
cmpl $(NR_syscalls), %eax
jnae syscall_call
jmp syscall_exit
END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
/*
 * Syscall exit-work path; two entry points share one body.
 *
 * syscall_exit_work_irqs_off is the entry for callers that arrive with
 * interrupts disabled (the SYSENTER exit path jumps here after
 * DISABLE_INTERRUPTS when a _TIF_ALLWORK_MASK bit is set).  It marks
 * IRQs as on for tracing, re-enables them, and falls through.
 */
syscall_exit_work_irqs_off:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
syscall_exit_work:
/* %eax = address of the saved register frame, handed to syscall_return_slowpath. */
movl %esp, %eax
call syscall_return_slowpath
/* Leave through the common restore_all return path. */
jmp restore_all
END(syscall_exit_work)
/*
 * syscall_fault - fail the current syscall with -EFAULT.
 *
 * Target of the range check on the user stack pointer and of the
 * exception-table fixup for the load of the sixth argument in the
 * SYSENTER entry code.
 */
syscall_fault:
/* The faulting load sits between ASM_STAC and ASM_CLAC, so issue the CLAC here. */
ASM_CLAC
/* Reload the thread_info pointer into %ebp. */
GET_THREAD_INFO(%ebp)
/* Store -EFAULT in the saved-EAX slot of the register frame. */
movl $-EFAULT, PT_EAX(%esp)
jmp resume_userspace
END(syscall_fault)
/*
 * sysenter_badsys - SYSENTER path for an out-of-range syscall number.
 *
 * Reached when %eax >= NR_syscalls.  Skips the sys_call_table call and
 * rejoins at sysenter_after_call with -ENOSYS in %eax, which that label
 * stores into PT_EAX.
 */
sysenter_badsys:
movl $-ENOSYS, %eax
jmp sysenter_after_call
END(sysenter_badsys)
.macro FIXUP_ESPFIX_STACK
/*
* Switch back for ESPFIX stack to the normal zerobased stack

View File

@ -34,6 +34,8 @@ __kernel_vsyscall:
/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
"syscall", X86_FEATURE_SYSCALL32
#else
ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
#endif
/* Enter using int $0x80 */