diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..25b1cc07d496 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; + u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..e5ecd20e72dd 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + #ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + pvclock_vdso_info = i; + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } + + register_task_migration_notifier(&pvclock_migrate); + return 0; } #endif diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 9793322751e0..30933760ee5f 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; + u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); + migrate_count = pvti->migrate_count; + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version)); + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..be98910cc1e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); +/* Notifier for when a task gets migrated to a new CPU */ +struct task_migration_notifier { + struct task_struct *task; + int from_cpu; + int to_cpu; +}; +extern void register_task_migration_notifier(struct notifier_block *n); + extern unsigned long get_parent_ip(unsigned long addr); extern void dump_cpu_task(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..d0c4209bb836 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu);