From d0b27fa77854b149ad4af08b0fe47fe712a47ade Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:57 +0200 Subject: [PATCH] sched: rt-group: synchonised bandwidth period Various SMP balancing algorithms require that the bandwidth period run in sync. Possible improvements are moving the rt_bandwidth thing into root_domain and keeping a span per rt_bandwidth which marks throttled cpus. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++ kernel/sched.c | 260 +++++++++++++++++++++++++++++++-------- kernel/sched_rt.c | 104 +++++++++++----- kernel/sysctl.c | 4 +- kernel/time/tick-sched.c | 5 - kernel/user.c | 28 +++++ 6 files changed, 320 insertions(+), 88 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 15f05ff453d8..be5d31752dbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,6 +1563,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES @@ -2052,6 +2056,9 @@ extern unsigned long sched_group_shares(struct task_group *tg); extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); extern long sched_group_rt_runtime(struct task_group *tg); +extern int sched_group_set_rt_period(struct task_group *tg, + long rt_period_us); +extern long sched_group_rt_period(struct task_group *tg); #endif #endif diff --git a/kernel/sched.c b/kernel/sched.c index e813e845d9cf..bb20323f7d09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -115,6 +115,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -156,6 +161,80 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include @@ -182,7 +261,7 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; @@ -407,8 +486,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -592,23 +669,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -664,10 +724,18 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} static const unsigned long long time_sync_thresh = 100000; @@ -3854,7 +3922,6 @@ void scheduler_tick(void) update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -4689,7 +4756,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -7288,6 +7355,14 @@ void __init sched_init(void) init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7312,15 +7387,11 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7506,8 +7577,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7596,6 +7665,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7621,7 +7692,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7674,6 +7746,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7775,6 +7848,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -7871,16 +7945,15 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? - RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); @@ -7898,16 +7971,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) return 0; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) { - u64 rt_runtime, rt_period; int err = 0; - rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { @@ -7918,7 +7986,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) err = -EINVAL; goto unlock; } - tg->rt_runtime = rt_runtime; + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -7926,19 +7995,96 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -7988,7 +8134,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -8066,6 +8212,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} #endif static struct cftype cpu_files[] = { @@ -8082,6 +8239,11 @@ static struct cftype cpu_files[] = { .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, + { + .name = "rt_period_us", + .read_uint = cpu_rt_period_read_uint, + .write_uint = cpu_rt_period_write_uint, + }, #endif }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..8bc176136666 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,7 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_runtime; + return rt_rq->tg->rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -127,14 +127,29 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } +#ifdef CONFIG_SMP +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} +#endif + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + #else static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (sysctl_sched_rt_runtime == -1) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; + return def_rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -173,8 +188,55 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) { return rt_rq->rt_throttled; } + +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime = rt_b->rt_runtime; + + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED @@ -198,11 +260,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return rt_rq_throttled(rt_rq); if (rt_rq->rt_time > runtime) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -212,29 +270,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -static void update_sched_rt_period(struct rq *rq) -{ - struct rt_rq *rt_rq; - u64 period; - - while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rq->rt_period_expire += period; - - for_each_leaf_rt_rq(rt_rq, rq) { - u64 runtime = sched_rt_runtime(rt_rq); - - rt_rq->rt_time -= min(rt_rq->rt_time, runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - sched_rt_rq_enqueue(rt_rq); - } - } - - rq->rt_throttled = 0; - } -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -284,6 +319,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +#else + start_rt_bandwidth(&def_rt_bandwidth); #endif } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index be332e1a0c29..fd3364827ccf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -315,7 +315,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69dba0c71727..d358d4e3a958 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - rt_jiffies = rt_needs_cpu(cpu); - if (rt_jiffies && rt_jiffies < delta_jiffies) - delta_jiffies = rt_jiffies; - if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..5925c6887c10 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + +static ssize_t cpu_rt_period_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); +} + +static ssize_t cpu_rt_period_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_period; + int rc; + + sscanf(buf, "%lu", &rt_period); + + rc = sched_group_set_rt_period(up->tg, rt_period); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_period_attr = + __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); #endif /* default attributes per uid directory */ @@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { #endif #ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, + &cpu_rt_period_attr.attr, #endif NULL };