mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-19 10:14:23 +08:00
4eaffdd5a5
My previous comments were still a bit confusing and there was a
typo. Fix it up.
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: 71b3c126e6
("x86/mm: Add barriers and document switch_mm()-vs-flush synchronization")
Link: http://lkml.kernel.org/r/0a0b43cdcdd241c5faaaecfbcc91a155ddedc9a1.1452631609.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
279 lines
7.6 KiB
C
279 lines
7.6 KiB
C
#ifndef _ASM_X86_MMU_CONTEXT_H
|
|
#define _ASM_X86_MMU_CONTEXT_H
|
|
|
|
#include <asm/desc.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/mm_types.h>
|
|
|
|
#include <trace/events/tlb.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/mpx.h>
|
|
#ifndef CONFIG_PARAVIRT
/*
 * Empty stand-in used when the kernel is built without CONFIG_PARAVIRT,
 * so that activate_mm() can invoke paravirt_activate_mm() unconditionally.
 */
static inline void
paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	/* Nothing to do without paravirt support. */
}
#endif	/* !CONFIG_PARAVIRT */
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
extern struct static_key rdpmc_always_available;

/*
 * Bring CR4.PCE in line with @mm.
 *
 * The bit is set when static_key_false(&rdpmc_always_available) is true
 * or when mm->context.perf_rdpmc_allowed reads as non-zero; otherwise it
 * is cleared.
 */
static inline void load_mm_cr4(struct mm_struct *mm)
{
	/* Neither condition holds: make sure PCE is off and stop here. */
	if (!static_key_false(&rdpmc_always_available) &&
	    !atomic_read(&mm->context.perf_rdpmc_allowed)) {
		cr4_clear_bits(X86_CR4_PCE);
		return;
	}

	cr4_set_bits(X86_CR4_PCE);
}
#else	/* !CONFIG_PERF_EVENTS */
/* Without CONFIG_PERF_EVENTS, load_mm_cr4() does nothing. */
static inline void load_mm_cr4(struct mm_struct *mm)
{
}
#endif	/* CONFIG_PERF_EVENTS */
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * An ldt_struct may be allocated, used and freed, but it is never
 * modified while it is live.
 */
struct ldt_struct {
	/*
	 * Xen needs LDTs to be page-aligned and to carry special
	 * permissions; that is what keeps us from installing evil
	 * descriptors such as call gates.  On native hardware the
	 * ldt_struct and the LDT itself could share one allocation,
	 * but that optimization is not worth the trouble.
	 */
	struct desc_struct *entries;
	/* Handed to set_ldt() together with @entries. */
	int size;
};

/*
 * Hooks used for LDT copy/destruction; defined out of line.
 */
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
#else	/* !CONFIG_MODIFY_LDT_SYSCALL */
/* Stub for kernels without modify_ldt(): always returns 0. */
static inline int
init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
	return 0;
}

/* Stub for kernels without modify_ldt(): does nothing. */
static inline void destroy_context(struct mm_struct *mm)
{
}
#endif	/* CONFIG_MODIFY_LDT_SYSCALL */
|
|
|
|
/*
 * Make the LDT register match @mm.
 *
 * With CONFIG_MODIFY_LDT_SYSCALL, mm->context.ldt is installed via
 * set_ldt() when it is non-NULL and the LDT is cleared otherwise.
 * Without that option the LDT is always cleared.  The function finishes
 * with DEBUG_LOCKS_WARN_ON(preemptible()), i.e. callers are expected to
 * run with preemption disabled.
 */
static inline void load_mm_ldt(struct mm_struct *mm)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	struct ldt_struct *cur;

	/* This lockless_dereference pairs with an smp_store_release. */
	cur = lockless_dereference(mm->context.ldt);

	/*
	 * Every update of mm->context.ldt is followed by an IPI to all
	 * CPUs on which the mm is active, and the old LDT is not freed
	 * until each of those CPUs has handled that IPI.  So even if the
	 * ldt_struct is replaced before we return, what we read here is
	 * safe to use, and the new values will be loaded before any user
	 * code runs.
	 *
	 * NB: converting this to RCU needs extreme care.  IRQs would
	 * still have to be off, since we must not overwrite the local
	 * LDT after an IPI has already loaded a value newer than the
	 * one visible to us.
	 */
	if (likely(!cur))
		clear_LDT();
	else
		set_ldt(cur->entries, cur->size);
#else
	clear_LDT();
#endif

	DEBUG_LOCKS_WARN_ON(preemptible());
}
|
|
|
|
/*
 * On SMP, move this CPU's cpu_tlbstate.state from TLBSTATE_OK to
 * TLBSTATE_LAZY; any other state is left untouched.  On !SMP builds
 * this is a no-op.  Neither @mm nor @tsk is used.
 */
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
		return;

	this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
|
|
|
|
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk)
|
|
{
|
|
unsigned cpu = smp_processor_id();
|
|
|
|
if (likely(prev != next)) {
|
|
#ifdef CONFIG_SMP
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
|
this_cpu_write(cpu_tlbstate.active_mm, next);
|
|
#endif
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
|
|
|
/*
|
|
* Re-load page tables.
|
|
*
|
|
* This logic has an ordering constraint:
|
|
*
|
|
* CPU 0: Write to a PTE for 'next'
|
|
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
|
* CPU 1: set bit 1 in next's mm_cpumask
|
|
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
|
*
|
|
* We need to prevent an outcome in which CPU 1 observes
|
|
* the new PTE value and CPU 0 observes bit 1 clear in
|
|
* mm_cpumask. (If that occurs, then the IPI will never
|
|
* be sent, and CPU 0's TLB will contain a stale entry.)
|
|
*
|
|
* The bad outcome can occur if either CPU's load is
|
|
* reordered before that CPU's store, so both CPUs must
|
|
* execute full barriers to prevent this from happening.
|
|
*
|
|
* Thus, switch_mm needs a full barrier between the
|
|
* store to mm_cpumask and any operation that could load
|
|
* from next->pgd. TLB fills are special and can happen
|
|
* due to instruction fetches or for no reason at all,
|
|
* and neither LOCK nor MFENCE orders them.
|
|
* Fortunately, load_cr3() is serializing and gives the
|
|
* ordering guarantee we need.
|
|
*
|
|
*/
|
|
load_cr3(next->pgd);
|
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
|
|
|
/* Stop flush ipis for the previous mm */
|
|
cpumask_clear_cpu(cpu, mm_cpumask(prev));
|
|
|
|
/* Load per-mm CR4 state */
|
|
load_mm_cr4(next);
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* Load the LDT, if the LDT is different.
|
|
*
|
|
* It's possible that prev->context.ldt doesn't match
|
|
* the LDT register. This can happen if leave_mm(prev)
|
|
* was called and then modify_ldt changed
|
|
* prev->context.ldt but suppressed an IPI to this CPU.
|
|
* In this case, prev->context.ldt != NULL, because we
|
|
* never set context.ldt to NULL while the mm still
|
|
* exists. That means that next->context.ldt !=
|
|
* prev->context.ldt, because mms never share an LDT.
|
|
*/
|
|
if (unlikely(prev->context.ldt != next->context.ldt))
|
|
load_mm_ldt(next);
|
|
#endif
|
|
}
|
|
#ifdef CONFIG_SMP
|
|
else {
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
|
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
|
|
|
|
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
|
/*
|
|
* On established mms, the mm_cpumask is only changed
|
|
* from irq context, from ptep_clear_flush() while in
|
|
* lazy tlb mode, and here. Irqs are blocked during
|
|
* schedule, protecting us from simultaneous changes.
|
|
*/
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
|
|
|
/*
|
|
* We were in lazy tlb mode and leave_mm disabled
|
|
* tlb flush IPI delivery. We must reload CR3
|
|
* to make sure to use no freed page tables.
|
|
*
|
|
* As above, load_cr3() is serializing and orders TLB
|
|
* fills with respect to the mm_cpumask write.
|
|
*/
|
|
load_cr3(next->pgd);
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
|
load_mm_cr4(next);
|
|
load_mm_ldt(next);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
 * activate_mm - notify paravirt of the new mm, then switch to it.
 * @prev: mm being switched away from
 * @next: mm being switched to
 *
 * Calls paravirt_activate_mm(prev, next) followed by
 * switch_mm(prev, next, NULL).
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the do/while statement, so "activate_mm(a, b);" is exactly
 * one statement and is safe as the body of an unbraced if/else.
 */
#define activate_mm(prev, next)			\
do {						\
	paravirt_activate_mm((prev), (next));	\
	switch_mm((prev), (next), NULL);	\
} while (0)
|
|
|
|
/*
 * deactivate_mm - reset user segment state.
 *
 * 32-bit: lazy_load_gs(0).
 * 64-bit: load_gs_index(0), then load 0 into fs.
 *
 * Neither @tsk nor @mm is evaluated by either variant.
 */
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)	\
do {				\
	lazy_load_gs(0);	\
} while (0)
#else	/* !CONFIG_X86_32 */
#define deactivate_mm(tsk, mm)	\
do {				\
	load_gs_index(0);	\
	loadsegment(fs, 0);	\
} while (0)
#endif	/* CONFIG_X86_32 */
|
|
|
|
/*
 * Arch hook for mmap duplication: forwards @oldmm and @mm unchanged to
 * paravirt_arch_dup_mmap().
 */
static inline void
arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	paravirt_arch_dup_mmap(oldmm, mm);
}
|
|
|
|
/*
 * Arch hook for mmap teardown: forwards @mm unchanged to
 * paravirt_arch_exit_mmap().
 */
static inline void
arch_exit_mmap(struct mm_struct *mm)
{
	paravirt_arch_exit_mmap(mm);
}
|
|
|
|
#ifdef CONFIG_X86_64
/*
 * Report whether @mm is a 64-bit mm.
 *
 * Without CONFIG_IA32_EMULATION every mm is 64-bit; otherwise the mm is
 * 64-bit unless mm->context.ia32_compat equals TIF_IA32.
 */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	if (!config_enabled(CONFIG_IA32_EMULATION))
		return true;

	return mm->context.ia32_compat != TIF_IA32;
}
#else	/* !CONFIG_X86_64 */
/* On a 32-bit kernel no mm is 64-bit. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return false;
}
#endif	/* CONFIG_X86_64 */
|
|
|
|
/*
 * Arch hook for bprm mm initialisation: hands @mm to mpx_mm_init().
 * @vma is not used.
 */
static inline void
arch_bprm_mm_init(struct mm_struct *mm, struct vm_area_struct *vma)
{
	mpx_mm_init(mm);
}
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
/*
|
|
* mpx_notify_unmap() goes and reads a rarely-hot
|
|
* cacheline in the mm_struct. That can be expensive
|
|
* enough to be seen in profiles.
|
|
*
|
|
* The mpx_notify_unmap() call and its contents have been
|
|
* observed to affect munmap() performance on hardware
|
|
* where MPX is not present.
|
|
*
|
|
* The unlikely() optimizes for the fast case: no MPX
|
|
* in the CPU, or no MPX use in the process. Even if
|
|
* we get this wrong (in the unlikely event that MPX
|
|
* is widely enabled on some system) the overhead of
|
|
* MPX itself (reading bounds tables) is expected to
|
|
* overwhelm the overhead of getting this unlikely()
|
|
* consistently wrong.
|
|
*/
|
|
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
|
|
mpx_notify_unmap(mm, vma, start, end);
|
|
}
|
|
|
|
#endif /* _ASM_X86_MMU_CONTEXT_H */
|