diff --git a/fs/exec.c b/fs/exec.c
index 6c53920795c2..aaa605529a75 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -990,7 +990,7 @@ static int exec_mmap(struct mm_struct *mm)
 	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
 	tsk->mm = mm;
-	mm_init_cid(mm);
+	mm_init_cid(mm, tsk);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..381d22eba088 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -782,6 +782,7 @@ struct vm_area_struct {
 struct mm_cid {
 	u64 time;
 	int cid;
+	int recent_cid;
 };
 #endif
 
@@ -852,6 +853,27 @@ struct mm_struct {
 		 * When the next mm_cid scan is due (in jiffies).
 		 */
 		unsigned long mm_cid_next_scan;
+		/**
+		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
+		 *
+		 * Number of CPUs allowed in the union of all mm's
+		 * threads allowed CPUs.
+		 */
+		unsigned int nr_cpus_allowed;
+		/**
+		 * @max_nr_cid: Maximum number of concurrency IDs allocated.
+		 *
+		 * Track the highest number of concurrency IDs allocated for the
+		 * mm.
+		 */
+		atomic_t max_nr_cid;
+		/**
+		 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
+		 *
+		 * Provide mutual exclusion for mm cpus_allowed and
+		 * mm nr_cpus_allowed updates.
+		 */
+		raw_spinlock_t cpus_allowed_lock;
 #endif
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
@@ -1170,18 +1192,30 @@ static inline int mm_cid_clear_lazy_put(int cid)
 	return cid & ~MM_CID_LAZY_PUT;
 }
 
+/*
+ * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
+ */
+static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
+{
+	unsigned long bitmap = (unsigned long)mm;
+
+	bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	/* Skip cpu_bitmap */
+	bitmap += cpumask_size();
+	return (struct cpumask *)bitmap;
+}
+
 /* Accessor for struct mm_struct's cidmask. */
 static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 {
-	unsigned long cid_bitmap = (unsigned long)mm;
+	unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
 
-	cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
-	/* Skip cpu_bitmap */
+	/* Skip mm_cpus_allowed */
 	cid_bitmap += cpumask_size();
 	return (struct cpumask *)cid_bitmap;
 }
 
-static inline void mm_init_cid(struct mm_struct *mm)
+static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
 	int i;
 
@@ -1189,17 +1223,22 @@ static inline void mm_init_cid(struct mm_struct *mm)
 		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
 
 		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu_cid->recent_cid = MM_CID_UNSET;
 		pcpu_cid->time = 0;
 	}
+	mm->nr_cpus_allowed = p->nr_cpus_allowed;
+	atomic_set(&mm->max_nr_cid, 0);
+	raw_spin_lock_init(&mm->cpus_allowed_lock);
+	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
 }
 
-static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
+static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
 {
 	mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
 	if (!mm->pcpu_cid)
 		return -ENOMEM;
-	mm_init_cid(mm);
+	mm_init_cid(mm, p);
 	return 0;
 }
 #define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
@@ -1212,16 +1251,31 @@ static inline void mm_destroy_cid(struct mm_struct *mm)
 
 static inline unsigned int mm_cid_size(void)
 {
-	return cpumask_size();
+	return 2 * cpumask_size();	/* mm_cpus_allowed(), mm_cidmask(). */
+}
+
+static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
+{
+	struct cpumask *mm_allowed = mm_cpus_allowed(mm);
+
+	if (!mm)
+		return;
+	/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
+	raw_spin_lock(&mm->cpus_allowed_lock);
+	cpumask_or(mm_allowed, mm_allowed, cpumask);
+	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
+	raw_spin_unlock(&mm->cpus_allowed_lock);
 }
 #else /* CONFIG_SCHED_MM_CID */
-static inline void mm_init_cid(struct mm_struct *mm) { }
-static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
+static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
 static inline void mm_destroy_cid(struct mm_struct *mm) { }
+
 static inline unsigned int mm_cid_size(void)
 {
 	return 0;
 }
+static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
 #endif /* CONFIG_SCHED_MM_CID */
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..7d950e93f080 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1298,7 +1298,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (init_new_context(p, mm))
 		goto fail_nocontext;
 
-	if (mm_alloc_cid(mm))
+	if (mm_alloc_cid(mm, p))
 		goto fail_cid;
 
 	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7db711ba6d12..f5ec452e2c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2696,6 +2696,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
 		put_prev_task(rq, p);
 
 	p->sched_class->set_cpus_allowed(p, ctx);
+	mm_set_cpus_allowed(p->mm, ctx->new_mask);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -10243,6 +10244,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
 	 */
 	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
 		return -1;
+	WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
 	return src_cid;
 }
 
@@ -10255,7 +10257,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 {
 	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
 	struct mm_struct *mm = t->mm;
-	int src_cid, dst_cid, src_cpu;
+	int src_cid, src_cpu;
+	bool dst_cid_is_set;
 	struct rq *src_rq;
 
 	lockdep_assert_rq_held(dst_rq);
@@ -10272,9 +10275,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 	 * allocation closest to 0 in cases where few threads migrate around
 	 * many CPUs.
 	 *
-	 * If destination cid is already set, we may have to just clear
-	 * the src cid to ensure compactness in frequent migrations
-	 * scenarios.
+	 * If destination cid or recent cid is already set, we may have
+	 * to just clear the src cid to ensure compactness in frequent
+	 * migrations scenarios.
 	 *
 	 * It is not useful to clear the src cid when the number of threads is
 	 * greater or equal to the number of allowed CPUs, because user-space
@@ -10282,9 +10285,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 	 * allowed CPUs.
 	 */
 	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
-	dst_cid = READ_ONCE(dst_pcpu_cid->cid);
-	if (!mm_cid_is_unset(dst_cid) &&
-	    atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+	dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
+			 !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
+	if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
 		return;
 	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
 	src_rq = cpu_rq(src_cpu);
@@ -10295,13 +10298,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 							    src_cid);
 	if (src_cid == -1)
 		return;
-	if (!mm_cid_is_unset(dst_cid)) {
+	if (dst_cid_is_set) {
 		__mm_cid_put(mm, src_cid);
 		return;
 	}
 	/* Move src_cid to dst cpu. */
 	mm_cid_snapshot_time(dst_rq, mm);
 	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+	WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
 }
 
 static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
@@ -10540,7 +10544,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 		 * Matches barrier in sched_mm_cid_remote_clear_old().
 		 */
 		smp_mb();
-		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
 	}
 	rseq_set_notify_resume(t);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fba524c81c63..20b6e75604ec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3596,24 +3596,41 @@ static inline void mm_cid_put(struct mm_struct *mm)
 	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 }
 
-static inline int __mm_cid_try_get(struct mm_struct *mm)
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
 {
-	struct cpumask *cpumask;
-	int cid;
+	struct cpumask *cidmask = mm_cidmask(mm);
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	int cid = __this_cpu_read(pcpu_cid->recent_cid);
 
-	cpumask = mm_cidmask(mm);
+	/* Try to re-use recent cid. This improves cache locality. */
+	if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+		return cid;
 	/*
+	 * Expand cid allocation if the maximum number of concurrency
+	 * IDs allocated (max_nr_cid) is below the number cpus allowed
+	 * and number of threads. Expanding cid allocation as much as
+	 * possible improves cache locality.
+	 */
+	cid = atomic_read(&mm->max_nr_cid);
+	while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+		if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+			continue;
+		if (!cpumask_test_and_set_cpu(cid, cidmask))
+			return cid;
+	}
+	/*
+	 * Find the first available concurrency id.
 	 * Retry finding first zero bit if the mask is temporarily
 	 * filled. This only happens during concurrent remote-clear
 	 * which owns a cid without holding a rq lock.
 	 */
 	for (;;) {
-		cid = cpumask_first_zero(cpumask);
-		if (cid < nr_cpu_ids)
+		cid = cpumask_first_zero(cidmask);
+		if (cid < READ_ONCE(mm->nr_cpus_allowed))
 			break;
 		cpu_relax();
 	}
-	if (cpumask_test_and_set_cpu(cid, cpumask))
+	if (cpumask_test_and_set_cpu(cid, cidmask))
 		return -1;
 
 	return cid;
@@ -3631,7 +3648,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
 	WRITE_ONCE(pcpu_cid->time, rq->clock);
 }
 
-static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
+			       struct mm_struct *mm)
 {
 	int cid;
 
@@ -3641,13 +3659,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
 	 * guarantee forward progress.
 	 */
 	if (!READ_ONCE(use_cid_lock)) {
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		if (cid >= 0)
 			goto end;
 		raw_spin_lock(&cid_lock);
 	} else {
 		raw_spin_lock(&cid_lock);
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		if (cid >= 0)
 			goto unlock;
 	}
@@ -3667,7 +3685,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
 	 * all newcoming allocations observe the use_cid_lock flag set.
 	 */
	do {
-		cid = __mm_cid_try_get(mm);
+		cid = __mm_cid_try_get(t, mm);
 		cpu_relax();
 	} while (cid < 0);
 	/*
@@ -3684,7 +3702,8 @@ end:
 	return cid;
 }
 
-static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
+			     struct mm_struct *mm)
 {
 	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
 	struct cpumask *cpumask;
@@ -3701,8 +3720,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
 		if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
 			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 	}
-	cid = __mm_cid_get(rq, mm);
+	cid = __mm_cid_get(rq, t, mm);
 	__this_cpu_write(pcpu_cid->cid, cid);
+	__this_cpu_write(pcpu_cid->recent_cid, cid);
 	return cid;
 }
 
@@ -3755,7 +3775,7 @@ static inline void switch_mm_cid(struct rq *rq,
 		prev->mm_cid = -1;
 	}
 	if (next->mm_cid_active)
-		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
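
The hunks above only change how the kernel allocates and recycles mm_cid values; the userspace ABI is untouched. For context, a minimal, hypothetical sketch of the consumer side follows: a thread registers rseq and reads the per-mm concurrency ID it would use to index per-thread data. The names RSEQ_SIG and rseq_register() are illustrative, not taken from this patch, and the sketch assumes a kernel exposing mm_cid through rseq and that libc has not already registered rseq for the thread (glibc 2.35+ does, in which case the raw syscall fails with EBUSY and the __rseq_offset mechanism should be used instead).

```c
#define _GNU_SOURCE
#include <linux/rseq.h>		/* UAPI struct rseq, including the mm_cid field */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define RSEQ_SIG	0x53053053	/* arbitrary signature, must match on unregister */

/* One rseq area per thread; the UAPI type already carries the required 32-byte alignment. */
static __thread struct rseq rseq_area;

static int rseq_register(void)
{
	/* sys_rseq(rseq, rseq_len, flags, sig); flags == 0 registers the area. */
	return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, RSEQ_SIG);
}

int main(void)
{
	if (rseq_register()) {
		perror("rseq registration");
		return 1;
	}
	/*
	 * mm_cid is a compact index, unique among the currently running
	 * threads of this mm and bounded by min(nr threads, nr allowed CPUs).
	 * With the patch above, a thread also tends to get its recently used
	 * value back, so data indexed by mm_cid stays cache-hot.
	 */
	printf("running on cpu %u with concurrency id %u\n",
	       rseq_area.cpu_id, rseq_area.mm_cid);
	return 0;
}
```

The reason compact, sticky IDs matter for such a consumer is that per-cid arrays can be sized by the number of threads rather than the number of possible CPUs, and an intermittently running thread tends to come back to the same slot, and therefore the same cache lines.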