From 8ed92e51f99c2199c64cb33b4ba95ab12940a94c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Oct 2012 14:28:50 +0200 Subject: [PATCH 01/33] sched: Add WAKEUP_PREEMPTION feature flag, on by default As per the recent discussion with Mike and Linus, make it easier to test with/without this feature. No change in default behavior. Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-izoxq4haeg4mTognnDbwcevt@git.kernel.org --- kernel/sched/fair.c | 2 +- kernel/sched/features.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..f936552b3db1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2907,7 +2907,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL)) + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..e68e69ab917d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -31,6 +31,11 @@ SCHED_FEAT(LAST_BUDDY, true) */ SCHED_FEAT(CACHE_HOT_BUDDY, true) +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + /* * Use arch dependent cpu power functions */ From 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:29 +0200 Subject: [PATCH 02/33] sched: Track the runnable average on a per-task entity basis Instead of tracking the average load parented by a cfs_rq, we can track entity load directly, with the load for a given cfs_rq then being the sum of its children. 
To do this we represent the historical contribution to runnable average within each trailing 1024us of execution as the coefficients of a geometric series. We can express this for a given task t as: runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t) Where: u_i is the usage in the i-th most recent 1024us period (approximately 1ms), and y is chosen such that y^k = 1/2. We currently choose k to be 32 which roughly translates to about a sched period. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.372695337@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++ kernel/sched/core.c | 5 ++ kernel/sched/debug.c | 4 ++ kernel/sched/fair.c | 129 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e..418fc6d8a4da 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1095,6 +1095,16 @@ struct load_weight { unsigned long weight, inv_weight; }; +struct sched_avg { + /* + * These sums represent an infinite geometric series and so are bound + * above by 1024/(1-y). Thus we only need a u32 to store them for for all + * choices of y < 1-2^(-32)*1024. 
+ */ + u32 runnable_avg_sum, runnable_avg_period; + u64 last_runnable_update; +}; + #ifdef CONFIG_SCHEDSTATS struct sched_statistics { u64 wait_start; @@ -1155,6 +1165,9 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif +#ifdef CONFIG_SMP + struct sched_avg avg; +#endif }; struct sched_rt_entity { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..fd9d0859350a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1524,6 +1524,11 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_SMP + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..61f70979153a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); +#endif #undef PN #undef P } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..16d67f9b6955 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -971,6 +971,126 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SMP +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + for (; n && val; n--) { + val *= 4008; + val >>= 12; + } + + return val; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. 
To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable) +{ + u64 delta; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. 
+ */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + BUG_ON(delta_w > delta); + do { + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + /* + * Remainder of delta initiates a new period, roll over + * the previous. + */ + sa->runnable_avg_sum = + decay_load(sa->runnable_avg_sum, 1); + sa->runnable_avg_period = + decay_load(sa->runnable_avg_period, 1); + + delta -= delta_w; + /* New period is empty */ + delta_w = 1024; + } while (delta >= 1024); + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; +} + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se) +{ + __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, + se->on_rq); +} +#else +static inline void update_entity_load_avg(struct sched_entity *se) {} +#endif + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -1097,6 +1217,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); + update_entity_load_avg(se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1171,6 +1292,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + update_entity_load_avg(se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1340,6 +1462,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev); } cfs_rq->curr = NULL; } @@ -1352,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Ensure that runnable average is periodically updated. + */ + update_entity_load_avg(curr); + /* * Update share accounting for long-running entities. */ From 18bf2805d9b30cb823d4919b42cd230f59c7ce1f Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 4 Oct 2012 12:51:20 +0200 Subject: [PATCH 03/33] sched: Maintain per-rq runnable averages Since runqueues do not have a corresponding sched_entity we instead embed a sched_avg structure directly. 
Signed-off-by: Ben Segall Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.442637130@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 10 ++++++++-- kernel/sched/fair.c | 18 ++++++++++++++++-- kernel/sched/sched.h | 2 ++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 61f70979153a..4240abce4116 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16d67f9b6955..8c5468fcf10d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1087,8 +1087,14 @@ static inline void update_entity_load_avg(struct sched_entity *se) __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, se->on_rq); } + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); +} #else static inline void update_entity_load_avg(struct sched_entity *se) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2340,8 +2346,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } 
@@ -2399,8 +2407,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -4586,6 +4596,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ @@ -5083,6 +5095,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + update_rq_runnable_avg(rq, 1); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..14b571968713 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -467,6 +467,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) From 2dac754e10a5d41d94d2d2365c0345d4f215a266 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: [PATCH 04/33] sched: Aggregate load contributed by task entities on parenting cfs_rq For a given task t, we can compute its contribution to load as: task_load(t) = runnable_avg(t) * weight(t) On a parenting cfs_rq we can then aggregate: runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t Maintain this bottom up, with task entities adding their contributed load to the parenting cfs_rq sum. When a task entity's load changes we add the same delta to the maintained sum. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.514678907@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/debug.c | 3 +++ kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 10 ++++++++- 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 418fc6d8a4da..81d8b1ba4100 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1103,6 +1103,7 @@ struct sched_avg { */ u32 runnable_avg_sum, runnable_avg_period; u64 last_runnable_update; + unsigned long load_avg_contrib; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4240abce4116..c953a89f94aa 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); + P(se->avg.load_avg_contrib); #endif #undef PN #undef P @@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->load_contribution); SEQ_printf(m, " .%-30s: %d\n", "load_tg", atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5468fcf10d..77af759e5675 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1081,20 +1081,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (!entity_is_task(se)) + return 0; + + se->avg.load_avg_contrib = 
div64_u64(se->avg.runnable_avg_sum * + se->load.weight, + se->avg.runnable_avg_period + 1); + + return se->avg.load_avg_contrib - old_contrib; +} + /* Update a sched_entity's runnable average */ static inline void update_entity_load_avg(struct sched_entity *se) { - __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, - se->on_rq); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + + if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg, + se->on_rq)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); } + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +} + +/* Remove se's load from this cfs_rq child load-average */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +} #else static inline void update_entity_load_avg(struct sched_entity *se) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1223,7 +1266,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_entity_load_avg(se); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ 
-1298,7 +1341,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_entity_load_avg(se); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14b571968713..e6539736af58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -222,6 +222,15 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP + /* + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). + */ + u64 runnable_load_avg; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -1214,4 +1223,3 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - From 9ee474f55664ff63111c843099d365e7ecffb56f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: [PATCH 05/33] sched: Maintain the load contribution of blocked entities We are currently maintaining: runnable_load(cfs_rq) = \Sum task_load(t) For all running children t of cfs_rq. While this can be naturally updated for tasks in a runnable state (as they are scheduled); this does not account for the load contributed by blocked task entities. This can be solved by introducing a separate accounting for blocked load: blocked_load(cfs_rq) = \Sum runnable(b) * weight(b) Obviously we do not want to iterate over all blocked entities to account for their decay, we instead observe that: runnable_load(t) = \Sum p_i*y^i and that to account for an additional idle period we only need to compute: y*runnable_load(t). 
This means that we can compute all blocked entities at once by evaluating: blocked_load(cfs_rq)` = y * blocked_load(cfs_rq) Finally we maintain a decay counter so that when a sleeping entity re-awakens we can determine how much of its load should be removed from the blocked sum. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 1 - kernel/sched/debug.c | 3 + kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 4 +- 5 files changed, 122 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 81d8b1ba4100..b1831accfd89 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1103,6 +1103,7 @@ struct sched_avg { */ u32 runnable_avg_sum, runnable_avg_period; u64 last_runnable_update; + s64 decay_count; unsigned long load_avg_contrib; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd9d0859350a..00898f1fb69e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif - #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c953a89f94aa..2d2e2b3c1bef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); P(se->avg.load_avg_contrib); + P(se->avg.decay_count); #endif #undef PN #undef P @@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->load_weight)); SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); + 
SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77af759e5675..83194175e841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); + static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. */ + update_cfs_rq_blocked_load(cfs_rq); } } @@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline void __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; +} + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { @@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + /* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se) +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) { 
struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; @@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se) return; contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + if (se->on_rq) cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) +{ + u64 now = rq_of(cfs_rq)->clock_task >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays) + return; + + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + + cfs_rq->last_decay = now; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable) /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int wakeup) { - update_entity_load_avg(se); + /* we track migrations using entity decay_count == 0 */ + if (unlikely(!se->avg.decay_count)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + if (wakeup) + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + + update_entity_load_avg(se, 0); cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + update_cfs_rq_blocked_load(cfs_rq); } -/* Remove se's load from this cfs_rq child load-average */ +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. 
+ */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int sleep) { - update_entity_load_avg(se); + update_entity_load_avg(se, 1); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } #else -static inline void update_entity_load_avg(struct sched_entity *se) {} +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int wakeup) {} static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - enqueue_entity_load_avg(cfs_rq, se); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. 
*/ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev); + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * Update share accounting for long-running entities. @@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) update_rq_clock(rq); update_cfs_load(cfs_rq, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * We need to update shares after updating tg->load_weight in @@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. 
+ */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif } /* @@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6539736af58..664ff39195f7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -229,7 +229,9 @@ struct cfs_rq { * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg; + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; + u64 last_decay; #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ From 0a74bef8bed18dc6889e9bc37ea1050a50c86c89 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: [PATCH 06/33] sched: Add an rq migration call-back to sched_class Since we are now doing bottom up load accumulation we need explicit notification when a task has been re-parented so that the old hierarchy can be updated. Adds: migrate_task_rq(struct task_struct *p, int next_cpu) (The alternative is to do this out of __set_task_cpu, but it was suggested that this would be a cleaner encapsulation.) 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.660023400@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 12 ++++++++++++ 3 files changed, 15 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b1831accfd89..e483ccb08ce6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1061,6 +1061,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00898f1fb69e..f26860074ef2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83194175e841..5e602e6ba0c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3047,6 +3047,17 @@ unlock: return new_cpu; } + +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. 
+ */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ +} #endif /* CONFIG_SMP */ static unsigned long @@ -5607,6 +5618,7 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, From aff3e498844441fa71c5ee1bbc470e1dff9548d9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: [PATCH 07/33] sched: Account for blocked load waking back up When a running entity blocks we migrate its tracked load to cfs_rq->blocked_load_avg. In the sleep case this occurs while holding rq->lock and so is a natural transition. Wake-ups however, are potentially asynchronous in the presence of migration and so special care must be taken. We use an atomic counter to track such migrated load, taking care to match this with the previously introduced decay counters so that we don't migrate too much load. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.726077467@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 100 ++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 2 +- 2 files changed, 81 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e602e6ba0c3..74dc29ba1ad1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,7 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update); static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { @@ -281,7 +282,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->on_list = 1; /* We should have no load, but we need to update last_decay. 
*/ - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -1086,17 +1087,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now, } /* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline void __synchronize_entity_decay(struct sched_entity *se) +static inline u64 __synchronize_entity_decay(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; if (!decays) - return; + return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); se->avg.decay_count = 0; + + return decays; } /* Compute the current contribution to load_avg by se, return any delta */ @@ -1149,20 +1152,26 @@ static inline void update_entity_load_avg(struct sched_entity *se, * Decay the load contributed by all blocked children and account this so that * their contribution may appropriately discounted when they wake up. */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) { u64 now = rq_of(cfs_rq)->clock_task >> 20; u64 decays; decays = now - cfs_rq->last_decay; - if (!decays) + if (!decays && !force_update) return; - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } - cfs_rq->last_decay = now; + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1175,20 +1184,42 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { - /* we track migrations using entity 
decay_count == 0 */ - if (unlikely(!se->avg.decay_count)) { + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + */ + if (unlikely(se->avg.decay_count <= 0)) { se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; + } wakeup = 0; } else { __synchronize_entity_decay(se); } - if (wakeup) + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } - update_entity_load_avg(se, 0); cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - update_cfs_rq_blocked_load(cfs_rq); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); } /* @@ -1201,6 +1232,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) { update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; if (sleep) { @@ -1218,7 +1251,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 
+ int force_update) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1610,7 +1644,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 1); /* * Update share accounting for long-running entities. @@ -3057,6 +3091,19 @@ unlock: static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } } #endif /* CONFIG_SMP */ @@ -3593,7 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) update_rq_clock(rq); update_cfs_load(cfs_rq, 1); - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 1); /* * We need to update shares after updating tg->load_weight in @@ -5390,12 +5437,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5427,8 +5476,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 
set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 664ff39195f7..30236ab4edc0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -230,7 +230,7 @@ struct cfs_rq { * the FAIR_GROUP_SCHED case). */ u64 runnable_load_avg, blocked_load_avg; - atomic64_t decay_counter; + atomic64_t decay_counter, removed_load; u64 last_decay; #endif #ifdef CONFIG_FAIR_GROUP_SCHED From c566e8e9e44b72b53091da20e2dedefc730f2ee2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: [PATCH 08/33] sched: Aggregate total task_group load Maintain a global running sum of the average load seen on each cfs_rq belonging to each task group so that it may be used in calculating an appropriate shares:weight distribution. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.792901086@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/sched.h | 4 ++++ 3 files changed, 30 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2d2e2b3c1bef..290892361a09 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -230,6 +230,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 74dc29ba1ad1..db788222f198 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1102,6 +1102,26 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return decays; } +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +#endif + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { @@ -1172,6 +1192,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) atomic64_add(decays, 
&cfs_rq->decay_counter); cfs_rq->last_decay = now; } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30236ab4edc0..924a99094888 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,7 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -232,6 +233,9 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; +#ifdef CONFIG_FAIR_GROUP_SCHED + u64 tg_load_contrib; +#endif #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ From 8165e145ceb62fc338e099c9b12b3239c83d2f8e Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 09/33] sched: Compute load contribution by a group entity Unlike task entities who have a fixed weight, group entities instead own a fraction of their parenting task_group's shares as their contributed weight. Compute this fraction so that we can correctly account hierarchies and shared entity nodes. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.855074415@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index db788222f198..e20cb2693ef7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1117,22 +1117,43 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, cfs_rq->tg_load_contrib += tg_contrib; } } + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); +} #else static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} #endif +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); +} + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { long old_contrib = se->avg.load_avg_contrib; - if (!entity_is_task(se)) - return 0; - - se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum * - se->load.weight, - se->avg.runnable_avg_period + 1); + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_group_entity_contrib(se); + } return se->avg.load_avg_contrib - old_contrib; } From bb17f65571e97a7ec0297571fb1154fbd107ad00 Mon Sep 17 00:00:00 2001 
From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 10/33] sched: Normalize tg load contributions against runnable time Entities of equal weight should receive equitable distribution of cpu time. This is challenging in the case of a task_group's shares as execution may be occurring on multiple cpus simultaneously. To handle this we divide up the shares into weights proportionate with the load on each cfs_rq. This does not, however, account for the fact that the sum of the parts may be less than one cpu and so we need to normalize: load(tg) = min(runnable_avg(tg), 1) * tg->shares Where runnable_avg is the aggregate time in which the task_group had runnable children. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 62 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 290892361a09..71b0ea325e93 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic64_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e20cb2693ef7..9e49722da032 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, } } +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load
contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + } +} + static inline void __update_group_entity_contrib(struct sched_entity *se) { struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg = cfs_rq->tg; + int runnable_avg; + u64 contrib; contrib = cfs_rq->tg_load_contrib * tg->shares; se->avg.load_avg_contrib = div64_u64(contrib, atomic64_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. 
+ */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } } #else static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} #endif @@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) if (entity_is_task(se)) { __update_task_entity_contrib(se); } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); __update_group_entity_contrib(se); } @@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); } /* Add the load generated by se into cfs_rq's child load-average */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 924a99094888..134928dc6f05 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -113,6 +113,7 @@ struct task_group { atomic_t load_weight; atomic64_t load_avg; + atomic_t runnable_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -234,6 +235,7 @@ struct cfs_rq { atomic64_t decay_counter, removed_load; u64 last_decay; #ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib; u64 tg_load_contrib; #endif #endif From f1b17280efbd21873d1db8631117bdbccbcb39a2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 11/33] sched: Maintain runnable averages across throttled periods With bandwidth control tracked entities may cease execution according to user specified bandwidth limits. Charging this time as either throttled or blocked however, is incorrect and would falsely skew in either direction. 
What we actually want is for any throttled periods to be "invisible" to load-tracking as they are removed from the system for that interval and contribute normally otherwise. Do this by moderating the progression of time to omit any periods in which the entity belonged to a throttled hierarchy. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.998912151@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 3 ++- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e49722da032..873c9f5c5796 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1222,15 +1222,26 @@ static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, cfs_rq->blocked_load_avg = 0; } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + /* Update a sched_entity's runnable average */ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; + u64 now; - if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg, - se->on_rq)) + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. 
+ */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1250,7 +1261,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, */ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) { - u64 now = rq_of(cfs_rq)->clock_task >> 20; + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; u64 decays; decays = now - cfs_rq->last_decay; @@ -1841,6 +1852,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1991,6 +2011,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->load_stamp += delta; cfs_rq->load_last += delta; + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; + /* update entity weight now that we are on_rq again */ update_cfs_shares(cfs_rq); } @@ -2005,8 +2029,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, record last load */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; + } cfs_rq->throttle_count++; return 0; @@ -2021,7 +2047,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy 
runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -2045,7 +2071,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2063,10 +2089,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2466,8 +2491,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 134928dc6f05..d13bce7a44ef 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -281,7 +281,8 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ From 82958366cfea1a50e7e90907b2d55ae29ed69974 Mon Sep 17 
00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 12/33] sched: Replace update_shares weight distribution with per-entity computation Now that the machinery is in place to compute contributed load in a bottom-up fashion, replace the shares distribution code within update_shares() accordingly. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.061208672@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 --- kernel/sched/fair.c | 157 +++++++------------------------------------ kernel/sched/sched.h | 36 ++++------ 3 files changed, 36 insertions(+), 165 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 71b0ea325e93..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -218,14 +218,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 873c9f5c5796..57fae95eed99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -658,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime
statistics. Skip current tasks that * are not in our scheduling class. @@ -680,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -806,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - 
update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -881,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -906,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -944,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -963,17 +877,9 @@ 
static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -1490,7 +1396,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1587,7 +1492,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); /* @@ -1756,11 +1660,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_entity_load_avg(curr, 1); update_cfs_rq_blocked_load(cfs_rq, 1); - /* - * Update share accounting for long-running entities. 
- */ - update_entity_shares_tick(cfs_rq); - #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -2005,18 +1904,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq->clock_task - cfs_rq->throttled_clock_task; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); } #endif @@ -2028,11 +1918,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ - if (!cfs_rq->throttle_count) { - update_cfs_load(cfs_rq, 0); + /* group is entering throttled state, stop time */ + if (!cfs_rq->throttle_count) cfs_rq->throttled_clock_task = rq->clock_task; - } cfs_rq->throttle_count++; return 0; @@ -2630,7 +2518,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); } @@ -2692,7 +2579,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); } @@ -3755,27 +3641,36 @@ next: */ static int update_shares_cpu(struct task_group *tg, int cpu) { + struct sched_entity *se; struct cfs_rq *cfs_rq; unsigned long flags; struct rq *rq; - if (!tg->se[cpu]) - return 0; - rq = cpu_rq(cpu); + se = tg->se[cpu]; cfs_rq = tg->cfs_rq[cpu]; raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); update_cfs_rq_blocked_load(cfs_rq, 
1); - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + if (se) { + update_entity_load_avg(se, 1); + /* + * We pivot on our runnable average having decayed to zero for + * list removal. This generally implies that all our children + * have also been removed (modulo rounding error or bandwidth + * control); however, such cases are rare and we can fix these + * at enqueue. + * + * TODO: fix up out-of-order children on enqueue. + */ + if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) + list_del_leaf_cfs_rq(cfs_rq); + } else { + update_rq_runnable_avg(rq, rq->nr_running); + } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5702,10 +5597,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d13bce7a44ef..0a75a430ca77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -234,11 +234,21 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; + #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib; u64 tg_load_contrib; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -254,28 +264,6 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ -#ifdef CONFIG_SMP - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. 
- */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; From 48a1675323fa1b7844e479ad2a4469f4558c0f79 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 13/33] sched: Refactor update_shares_cpu() -> update_blocked_avgs() Now that running entities maintain their own load-averages the work we must do in update_shares() is largely restricted to the periodic decay of blocked entities. This allows us to be a little less pessimistic regarding our occupancy on rq->lock and the associated rq->clock updates required. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.133999170@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57fae95eed99..dcc27d8ae6ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3639,20 +3639,15 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct sched_entity *se; - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - rq = cpu_rq(cpu); - se = tg->se[cpu]; - cfs_rq = tg->cfs_rq[cpu]; + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); update_cfs_rq_blocked_load(cfs_rq, 1); if (se) { @@ -3669,32 +3664,33 @@ static int update_shares_cpu(struct task_group *tg, int cpu) if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) list_del_leaf_cfs_rq(cfs_rq); } else { + struct rq *rq = rq_of(cfs_rq); update_rq_runnable_avg(rq, rq->nr_running); } - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; - rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. 
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - rcu_read_unlock(); + + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -3746,7 +3742,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -4813,7 +4809,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -5068,7 +5064,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { From f269ae0469fc882332bdfb5db15d3c1315fe2a10 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: [PATCH 14/33] sched: Update_cfs_shares at period edge Now that our measurement intervals are small (~1ms) we can amortize the posting of update_shares() to be about each period overflow. This is a large cost saving for frequently switching tasks. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.200772172@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dcc27d8ae6ba..002a7697f437 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1187,6 +1187,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) } __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + update_cfs_shares(cfs_rq); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1396,9 +1397,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1471,7 +1471,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1491,8 +1490,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - se->on_rq = 0; account_entity_dequeue(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1506,7 +1505,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + se->on_rq = 0; } /* @@ -2518,8 +2517,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -2579,8 +2578,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -5639,8 +5638,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) + for_each_sched_entity(se) { update_cfs_shares(group_cfs_rq(se)); + /* update contribution to parent */ + update_entity_load_avg(se, 1); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 5b51f2f80b3b906ce59bd4dce6eca3c7f34cb1b9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:32 +0200 Subject: [PATCH 15/33] sched: Make __update_entity_runnable_avg() fast __update_entity_runnable_avg forms the core of maintaining an entity's runnable load average. In this function we charge the accumulated run-time since last update and handle appropriate decay. 
In some cases, e.g. a waking task, this time interval may be much larger than our period unit. Fortunately we can exploit some properties of our series to perform decay for a blocked update in constant time and account the contribution for a running update in essentially-constant* time. [*]: For any running entity they should be performing updates at the tick which gives us a soft limit of 1 jiffy between updates, and we can compute up to a 32 jiffy update in a single pass. C program to generate the magic constants in the arrays: #include <math.h> #include <stdio.h> #define N 32 #define WMULT_SHIFT 32 const long WMULT_CONST = ((1UL << N) - 1); double y; long runnable_avg_yN_inv[N]; void calc_mult_inv() { int i; double yn = 0; printf("inverses\n"); for (i = 0; i < N; i++) { yn = (double)WMULT_CONST * pow(y, i); runnable_avg_yN_inv[i] = yn; printf("%2d: 0x%8lx\n", i, runnable_avg_yN_inv[i]); } printf("\n"); } long mult_inv(long c, int n) { return (c * runnable_avg_yN_inv[n]) >> WMULT_SHIFT; } void calc_yn_sum(int n) { int i; double sum = 0, sum_fl = 0, diff = 0; /* * We take the floored sum to ensure the sum of partial sums is never * larger than the actual sum.
*/ printf("sum y^n\n"); printf(" %8s %8s %8s\n", "exact", "floor", "error"); for (i = 1; i <= n; i++) { sum = (y * sum + y * 1024); sum_fl = floor(y * sum_fl+ y * 1024); printf("%2d: %8.0f %8.0f %8.0f\n", i, sum, sum_fl, sum_fl - sum); } printf("\n"); } void calc_conv(long n) { long old_n; int i = -1; printf("convergence (LOAD_AVG_MAX, LOAD_AVG_MAX_N)\n"); do { old_n = n; n = mult_inv(n, 1) + 1024; i++; } while (n != old_n); printf("%d> %ld\n", i - 1, n); printf("\n"); } void main() { y = pow(0.5, 1/(double)N); calc_mult_inv(); calc_conv(1024); calc_yn_sum(N); } [ Compile with -lm ] Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.277808946@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 002a7697f437..6ecf455fd95b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -883,18 +883,93 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. 
+ */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ static __always_inline u64 decay_load(u64 val, u64 n) { - for (; n && val; n--) { - val *= 4008; - val >>= 12; + unsigned int local_n; + + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; } - return val; + val *= runnable_avg_yN_inv[local_n]; + /* We don't use SRR here since we always want to round down.
*/ + return val >> 32; +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n) +{ + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } /* @@ -929,7 +1004,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, int runnable) { - u64 delta; + u64 delta, periods; + u32 runnable_contrib; int delta_w, decayed = 0; delta = now - sa->last_runnable_update; @@ -963,25 +1039,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; - BUG_ON(delta_w > delta); - do { - if (runnable) - sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; - /* - * Remainder of delta initiates a new period, roll over - * the previous.
- */ - sa->runnable_avg_sum = - decay_load(sa->runnable_avg_sum, 1); - sa->runnable_avg_period = - decay_load(sa->runnable_avg_period, 1); + delta -= delta_w; - delta -= delta_w; - /* New period is empty */ - delta_w = 1024; - } while (delta >= 1024); + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ From f4e26b120b9de84cb627bc7361ba43cfdc51341f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:32 +0200 Subject: [PATCH 16/33] sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking While per-entity load-tracking is generally useful, beyond computing shares distribution, e.g. runnable based load-balance (in progress), governors, power-management, etc. These facilities are not yet consumers of this data. This may be trivially reverted when the information is required; but avoid paying the overhead for calculations we will not use until then. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.422162369@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 13 +++++++++++-- kernel/sched/sched.h | 9 ++++++++- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e483ccb08ce6..e1581a029e3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1168,7 +1168,13 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif -#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + /* Per-entity load-tracking */ struct sched_avg avg; #endif }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f26860074ef2..5dae0d252ff7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1526,7 +1526,12 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). 
+ */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ecf455fd95b..3e6a3531fa90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3173,6 +3174,12 @@ unlock: return new_cpu; } +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3196,6 +3203,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } +#endif #endif /* CONFIG_SMP */ static unsigned long @@ -5773,8 +5781,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, +#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, - +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0a75a430ca77..5eca173b563f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -225,6 +225,12 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). 
+ */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. @@ -234,7 +240,8 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; - +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib; u64 tg_load_contrib; From e9c84cb8d5f1b1ea6fcbe6190d51dc84b6975938 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Jul 2012 13:53:26 +0200 Subject: [PATCH 17/33] sched: Describe CFS load-balancer Add some scribbles on how and why the load-balancer works.. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341316406.23484.64.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 118 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e6a3531fa90..a319d56c7605 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3456,8 +3456,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp #ifdef CONFIG_SMP /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + * W_i,n/P_i == W_j,n/P_j for all i,j (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + * W_i,0 = \Sum_j w_i,j (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. 
+ * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * + * We then move tasks around to minimize the imbalance. In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + * - infeasible weights; + * - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + * log_2 n 1 n + * \Sum { --- * --- * 2^i } = O(n) (5) + * i = 0 2^i 2^i + * `- size of each group + * | | `- number of cpus doing load-balance + * | `- freq + * `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer.
+ * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + * log_2 n + * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) + * k = 0 + * + * And you'll find that: + * + * A^(log_2 n)_i,j != 0 for all i,j (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + * O(nm log n), n := nr_cpus, m := nr_tasks (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + * s_k,i + * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) + * S_k + * + * Where + * + * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, it's a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + * rewrite all of this once again.] + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; From dcbf832e5823156e8f155359b47bd108cac8ad68 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 5 Oct 2012 23:07:19 +0200 Subject: [PATCH 18/33] vtime: Gather vtime declarations to their own header file These APIs are scattered around and are going to expand a bit. Let's create a dedicated header file for sanity.
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/hardirq.h | 11 +---------- include/linux/kernel_stat.h | 9 +-------- include/linux/vtime.h | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) create mode 100644 include/linux/vtime.h diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index cab3da3d0949..b083a475423d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif -struct task_struct; - -#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void vtime_account(struct task_struct *tsk) -{ -} -#else -extern void vtime_account(struct task_struct *tsk); -#endif - #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) static inline void rcu_nmi_enter(void) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 36d12f0884c3..1865b1f29770 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -130,12 +131,4 @@ extern void account_process_tick(struct task_struct *, int user); extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void vtime_task_switch(struct task_struct *prev); -extern void vtime_account_system(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); -#else -static inline void vtime_task_switch(struct task_struct *prev) { } -#endif - #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/vtime.h b/include/linux/vtime.h new file mode 100644 index 000000000000..7199c24c8204 --- /dev/null +++ b/include/linux/vtime.h @@ -0,0 +1,22 @@ +#ifndef 
_LINUX_KERNEL_VTIME_H +#define _LINUX_KERNEL_VTIME_H + +struct task_struct; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void vtime_task_switch(struct task_struct *prev); +extern void vtime_account_system(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); +#else +static inline void vtime_task_switch(struct task_struct *prev) { } +#endif + +#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) +static inline void vtime_account(struct task_struct *tsk) +{ +} +#else +extern void vtime_account(struct task_struct *tsk); +#endif + +#endif /* _LINUX_KERNEL_VTIME_H */ From 11113334d1c5dd5355c86e531c29f1202a855c86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Oct 2012 18:05:51 +0200 Subject: [PATCH 19/33] vtime: Make vtime_account_system() irqsafe vtime_account_system() currently has only one caller with vtime_account() which is irq safe. Now we are going to call it from other places like kvm where irqs are not always disabled by the time we account the cputime. So let's make it irqsafe. The arch implementation part is now prefixed with "__". vtime_account_idle() arch implementation is prefixed accordingly to stay consistent. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/ia64/kernel/time.c | 8 ++++---- arch/powerpc/kernel/time.c | 4 ++-- arch/s390/kernel/vtime.c | 4 ++++ include/linux/vtime.h | 4 +++- kernel/sched/cputime.c | 16 +++++++++++++--- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index f6388216080d..5e4850305d3f 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -106,9 +106,9 @@ void vtime_task_switch(struct task_struct *prev) struct thread_info *ni = task_thread_info(current); if (idle_task(smp_processor_id()) != prev) - vtime_account_system(prev); + __vtime_account_system(prev); else - vtime_account_idle(prev); + __vtime_account_idle(prev); vtime_account_user(prev); @@ -135,14 +135,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) return delta_stime; } -void vtime_account_system(struct task_struct *tsk) +void __vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); account_system_time(tsk, 0, delta, delta); } -void vtime_account_idle(struct task_struct *tsk) +void __vtime_account_idle(struct task_struct *tsk) { account_idle_time(vtime_delta(tsk)); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index ce4cb772dc78..0db456f30d45 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -336,7 +336,7 @@ static u64 vtime_delta(struct task_struct *tsk, return delta; } -void vtime_account_system(struct task_struct *tsk) +void __vtime_account_system(struct task_struct *tsk) { u64 delta, sys_scaled, stolen; @@ -346,7 +346,7 @@ void vtime_account_system(struct task_struct *tsk) account_steal_time(stolen); } -void vtime_account_idle(struct task_struct *tsk) +void __vtime_account_idle(struct task_struct *tsk) { u64 
delta, sys_scaled, stolen; diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 790334427895..783e988c4e1e 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -140,6 +140,10 @@ void vtime_account(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account); +void __vtime_account_system(struct task_struct *tsk) +__attribute__((alias("vtime_account"))); +EXPORT_SYMBOL_GPL(__vtime_account_system); + void __kprobes vtime_stop_cpu(void) { struct s390_idle_data *idle = &__get_cpu_var(s390_idle); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 7199c24c8204..b9fc4f9ab470 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -5,10 +5,12 @@ struct task_struct; #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void vtime_task_switch(struct task_struct *prev); +extern void __vtime_account_system(struct task_struct *tsk); extern void vtime_account_system(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); +extern void __vtime_account_idle(struct task_struct *tsk); #else static inline void vtime_task_switch(struct task_struct *prev) { } +static inline void vtime_account_system(struct task_struct *tsk) { } #endif #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a6..0359f47b0ae4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -433,10 +433,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } +void vtime_account_system(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + __vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system); + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). 
Archs that + * __vtime_account_system() and __vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -449,9 +459,9 @@ void vtime_account(struct task_struct *tsk) local_irq_save(flags); if (in_interrupt() || !is_idle_task(tsk)) - vtime_account_system(tsk); + __vtime_account_system(tsk); else - vtime_account_idle(tsk); + __vtime_account_idle(tsk); local_irq_restore(flags); } From b080935c8638e08134629d0a9ebdf35669bec14d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 5 Oct 2012 23:07:19 +0200 Subject: [PATCH 20/33] kvm: Directly account vtime to system on guest switch Switching to or from guest context is done in ioctl context. So by the time we call kvm_guest_enter() or kvm_guest_exit() we know we are not running the idle task. As a result, we can directly account the cputime using vtime_account_system(). There are three good reasons to do this: * We avoid some useless checks on guest switch. This slightly optimizes this fast path. * In the case of CONFIG_IRQ_TIME_ACCOUNTING, calling vtime_account() checks for irq time to account. This is pointless since we know we are not in an irq on guest switch, so it wastes cpu cycles for no good reason. vtime_account_system() OTOH is a no-op in this config option. * We can remove the irq disable/enable around kvm guest switch in s390. A further optimization may consist in introducing a vtime_account_guest() that directly calls account_guest_time(). 
Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Joerg Roedel Cc: Alexander Graf Cc: Xiantao Zhang Cc: Christian Borntraeger Cc: Cornelia Huck Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- arch/s390/kvm/kvm-s390.c | 4 ---- include/linux/kvm_host.h | 12 ++++++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ecced9d18986..d91a95568002 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -608,9 +608,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_s390_deliver_pending_interrupts(vcpu); vcpu->arch.sie_block->icptcode = 0; - local_irq_disable(); kvm_guest_enter(); - local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); trace_kvm_s390_sie_enter(vcpu, @@ -629,9 +627,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); - local_irq_disable(); kvm_guest_exit(); - local_irq_enable(); memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); return rc; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 93bfc9f9815c..0e2212fe4784 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -737,7 +737,11 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { BUG_ON(preemptible()); - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. 
In fact switching to a guest mode @@ -751,7 +755,11 @@ static inline void kvm_guest_enter(void) static inline void kvm_guest_exit(void) { - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); current->flags &= ~PF_VCPU; } From fa5058f3b63153e0147ef65bcdb3a4ee63581346 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Oct 2012 04:07:19 +0200 Subject: [PATCH 21/33] cputime: Specialize irq vtime hooks With CONFIG_VIRT_CPU_ACCOUNTING, when vtime_account() is called in irq entry/exit, we perform a check on the context: if we are interrupting the idle task we account the pending cputime to idle, otherwise account to system time or its sub-areas: tsk->stime, hardirq time, softirq time, ... However this check for idle only concerns the hardirq entry and softirq entry: * Hardirq may directly interrupt the idle task, in which case we need to flush the pending CPU time to idle. * The idle task may be directly interrupted by a softirq if it calls local_bh_enable(). There is probably no such call in any idle task but we need to cover every case. Ksoftirqd is not concerned because the idle time is flushed on context switch, and softirqs at the end of a hardirq have the idle time already flushed from the hardirq entry. In the other cases we always account to system/irq time: * On hardirq exit we account the time to hardirq time. * On softirq exit we account the time to softirq time. To optimize this and avoid the indirect call to vtime_account() and the checks it performs, specialize the vtime irq APIs and only perform the check on irq entry. Irq exit can directly call vtime_account_system(). CONFIG_IRQ_TIME_ACCOUNTING behaviour doesn't change and directly maps to its own vtime_account() implementation. One may want to benefit from the new APIs to optimize irq time accounting as well in the future. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/hardirq.h | 4 ++-- include/linux/vtime.h | 25 +++++++++++++++++++++++++ kernel/softirq.c | 6 +++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index b083a475423d..624ef3f45c8e 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -153,7 +153,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - vtime_account(current); \ + vtime_account_irq_enter(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -169,7 +169,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - vtime_account(current); \ + vtime_account_irq_exit(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index b9fc4f9ab470..c35c02223da8 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -21,4 +21,29 @@ static inline void vtime_account(struct task_struct *tsk) extern void vtime_account(struct task_struct *tsk); #endif +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + /* + * Hardirq can interrupt idle task anytime. So we need vtime_account() + * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING. + * Softirq can also interrupt idle task directly if it calls + * local_bh_enable(). Such case probably don't exist but we never know. + * Ksoftirqd is not concerned because idle time is flushed on context + * switch. Softirqs in the end of hardirqs are also not a problem because + * the idle time is flushed on hardirq time already. 
+ */ + vtime_account(tsk); +} + +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + /* On hard|softirq exit we always account to hard|softirq cputime */ + __vtime_account_system(tsk); +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + vtime_account(tsk); +#endif +} + #endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc0c2c9..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - vtime_account(current); + vtime_account_irq_enter(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart: lockdep_softirq_exit(); - vtime_account(current); + vtime_account_irq_exit(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - vtime_account(current); + vtime_account_irq_exit(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) From 3e1df4f506836e6bea1ab61cf88c75c8b1840643 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Oct 2012 05:23:22 +0200 Subject: [PATCH 22/33] cputime: Separate irqtime accounting from generic vtime vtime_account() doesn't have the same role in CONFIG_VIRT_CPU_ACCOUNTING and CONFIG_IRQ_TIME_ACCOUNTING. In the first case it handles time accounting in any context. In the second case it only handles irq time accounting. So when vtime_account() is called from outside vtime_account_irq_*() this call is pointless for CONFIG_IRQ_TIME_ACCOUNTING. To fix the confusion, change vtime_account() to irqtime_account_irq() in CONFIG_IRQ_TIME_ACCOUNTING. This way we ensure future vtime_account() calls won't waste cycles in the irqtime APIs. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/vtime.h | 18 ++++++++---------- kernel/sched/cputime.c | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index c35c02223da8..0c2a2d303020 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -8,17 +8,18 @@ extern void vtime_task_switch(struct task_struct *prev); extern void __vtime_account_system(struct task_struct *tsk); extern void vtime_account_system(struct task_struct *tsk); extern void __vtime_account_idle(struct task_struct *tsk); +extern void vtime_account(struct task_struct *tsk); #else static inline void vtime_task_switch(struct task_struct *prev) { } +static inline void __vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_system(struct task_struct *tsk) { } +static inline void vtime_account(struct task_struct *tsk) { } #endif -#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void vtime_account(struct task_struct *tsk) -{ -} +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +extern void irqtime_account_irq(struct task_struct *tsk); #else -extern void vtime_account(struct task_struct *tsk); +static inline void irqtime_account_irq(struct task_struct *tsk) { } #endif static inline void vtime_account_irq_enter(struct task_struct *tsk) @@ -33,17 +34,14 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) * the idle time is flushed on hardirq time already. 
*/ vtime_account(tsk); + irqtime_account_irq(tsk); } static inline void vtime_account_irq_exit(struct task_struct *tsk) { -#ifdef CONFIG_VIRT_CPU_ACCOUNTING /* On hard|softirq exit we always account to hard|softirq cputime */ __vtime_account_system(tsk); -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - vtime_account(tsk); -#endif + irqtime_account_irq(tsk); } #endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0359f47b0ae4..8d859dae5bed 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq); static int irqtime_account_hi_update(void) { From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 28 Oct 2012 12:19:23 -0700 Subject: [PATCH 23/33] sched/autogroup: Fix crash on reboot when autogroup is disabled Due to these two commits: 8323f26ce342 sched: Fix race in task_group() 800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled ... autogroup scheduling's dynamic knobs are wrecked. With both patches applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to 800d4d30 not allowing autogroup to move things, and 8323f26ce making that the only way to switch runqueues. Remove most of the (dysfunctional) knobs and turn the remaining sched_autogroup_enabled knob readonly. 
If the user fiddles with cgroups hereafter, once tasks are moved, autogroup won't mess with them again unless they call setsid(). No knobs, no glitz, nada, just a cute little thing folks can turn on if they don't want to muck about with cgroups and/or systemd. Signed-off-by: Mike Galbraith Cc: Xiaotian Feng Cc: Peter Zijlstra Cc: Xiaotian Feng Cc: Linus Torvalds Cc: Andrew Morton Cc: Oleg Nesterov Cc: # v3.6 Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 --------------------------------------- kernel/sched/auto_group.c | 68 ++++++---------------------------- kernel/sched/auto_group.h | 9 +---- kernel/sysctl.c | 6 +-- 4 files changed, 14 insertions(+), 147 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb73..bb1d9623bad2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1271,81 +1271,6 @@ static const struct file_operations proc_pid_sched_operations = { #endif -#ifdef CONFIG_SCHED_AUTOGROUP -/* - * Print out autogroup related information: - */ -static int sched_autogroup_show(struct seq_file *m, void *v) -{ - struct inode *inode = m->private; - struct task_struct *p; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_autogroup_show_task(p, m); - - put_task_struct(p); - - return 0; -} - -static ssize_t -sched_autogroup_write(struct file *file, const char __user *buf, - size_t count, loff_t *offset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct task_struct *p; - char buffer[PROC_NUMBUF]; - int nice; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - - err = kstrtoint(strstrip(buffer), 0, &nice); - if (err < 0) - return err; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - - err = proc_sched_autogroup_set_nice(p, nice); - if (err) - count = err; - - put_task_struct(p); - - return count; -} - 
-static int sched_autogroup_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = single_open(filp, sched_autogroup_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; -} - -static const struct file_operations proc_pid_sched_autogroup_operations = { - .open = sched_autogroup_open, - .read = seq_read, - .write = sched_autogroup_write, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif /* CONFIG_SCHED_AUTOGROUP */ - static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3035,9 +2960,6 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..0f1bacb005a4 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -110,6 +110,9 @@ out_fail: bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { + if (!sysctl_sched_autogroup_enabled) + return false; + if (tg != &root_task_group) return false; @@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -159,8 +158,11 @@ out: /* Allocates GFP_KERNEL, cannot be called under any spinlock */ void sched_autogroup_create_attach(struct task_struct *p) { - struct autogroup *ag = autogroup_create(); + struct autogroup *ag; + if (!sysctl_sched_autogroup_enabled) + return; + ag = 
autogroup_create(); autogroup_move_group(p, ag); /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); @@ -176,11 +178,15 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; autogroup_kref_put(sig->autogroup); } @@ -193,58 +199,6 @@ static int __init setup_autogroup(char *str) __setup("noautogroup", setup_autogroup); -#ifdef CONFIG_PROC_FS - -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) -{ - static unsigned long next = INITIAL_JIFFIES; - struct autogroup *ag; - int err; - - if (nice < -20 || nice > 19) - return -EINVAL; - - err = security_task_setnice(current, nice); - if (err) - return err; - - if (nice < 0 && !can_nice(current, nice)) - return -EPERM; - - /* this is a heavy operation taking global locks.. 
*/ - if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) - return -EAGAIN; - - next = HZ / 10 + jiffies; - ag = autogroup_task_get(p); - - down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); - if (!err) - ag->nice = nice; - up_write(&ag->lock); - - autogroup_kref_put(ag); - - return err; -} - -void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) -{ - struct autogroup *ag = autogroup_task_get(p); - - if (!task_group_is_autogroup(ag->tg)) - goto out; - - down_read(&ag->lock); - seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); - up_read(&ag->lock); - -out: - autogroup_kref_put(ag); -} -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..4552c6bf79d2 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,11 +4,6 @@ #include struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. 
- */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; @@ -29,9 +24,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) + if (task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; return tg; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81c7b1a1a307..2914d0f752cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -363,10 +363,8 @@ static struct ctl_table kern_table[] = { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .mode = 0444, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_CFS_BANDWIDTH From fd25b4c2f226de818e1d2b71e3e681d28bcaf5ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Nov 2012 18:21:22 +0100 Subject: [PATCH 24/33] vtime: Remove the underscore prefix invasion Prepending irq-unsafe vtime APIs with underscores was actually a bad idea as the result is a big mess in the API namespace that is even waiting to be further extended. Also these helpers are always called from irq safe callers except kvm. Just provide a vtime_account_system_irqsafe() for this specific case so that we can remove the underscore prefix on other vtime functions. 
Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/ia64/kernel/time.c | 8 ++++---- arch/powerpc/kernel/time.c | 4 ++-- arch/s390/kernel/vtime.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- include/linux/vtime.h | 8 ++++---- kernel/sched/cputime.c | 12 ++++++------ 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 5e4850305d3f..f6388216080d 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -106,9 +106,9 @@ void vtime_task_switch(struct task_struct *prev) struct thread_info *ni = task_thread_info(current); if (idle_task(smp_processor_id()) != prev) - __vtime_account_system(prev); + vtime_account_system(prev); else - __vtime_account_idle(prev); + vtime_account_idle(prev); vtime_account_user(prev); @@ -135,14 +135,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) return delta_stime; } -void __vtime_account_system(struct task_struct *tsk) +void vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); account_system_time(tsk, 0, delta, delta); } -void __vtime_account_idle(struct task_struct *tsk) +void vtime_account_idle(struct task_struct *tsk) { account_idle_time(vtime_delta(tsk)); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0db456f30d45..ce4cb772dc78 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -336,7 +336,7 @@ static u64 vtime_delta(struct task_struct *tsk, return delta; } -void __vtime_account_system(struct task_struct *tsk) +void vtime_account_system(struct task_struct *tsk) { u64 delta, sys_scaled, stolen; @@ -346,7 +346,7 @@ void __vtime_account_system(struct task_struct *tsk) account_steal_time(stolen); } -void __vtime_account_idle(struct task_struct 
*tsk) +void vtime_account_idle(struct task_struct *tsk) { u64 delta, sys_scaled, stolen; diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 783e988c4e1e..80d1dbc5d42e 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -140,9 +140,9 @@ void vtime_account(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account); -void __vtime_account_system(struct task_struct *tsk) +void vtime_account_system(struct task_struct *tsk) __attribute__((alias("vtime_account"))); -EXPORT_SYMBOL_GPL(__vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_system); void __kprobes vtime_stop_cpu(void) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e2212fe4784..f17158bdd4fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -741,7 +741,7 @@ static inline void kvm_guest_enter(void) * This is running in ioctl context so we can avoid * the call to vtime_account() with its unnecessary idle check. */ - vtime_account_system(current); + vtime_account_system_irqsafe(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -759,7 +759,7 @@ static inline void kvm_guest_exit(void) * This is running in ioctl context so we can avoid * the call to vtime_account() with its unnecessary idle check. 
*/ - vtime_account_system(current); + vtime_account_system_irqsafe(current); current->flags &= ~PF_VCPU; } diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 0c2a2d303020..5ad13c325deb 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -5,14 +5,14 @@ struct task_struct; #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void vtime_task_switch(struct task_struct *prev); -extern void __vtime_account_system(struct task_struct *tsk); extern void vtime_account_system(struct task_struct *tsk); -extern void __vtime_account_idle(struct task_struct *tsk); +extern void vtime_account_system_irqsafe(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account(struct task_struct *tsk); #else static inline void vtime_task_switch(struct task_struct *prev) { } -static inline void __vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_system(struct task_struct *tsk) { } +static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { } static inline void vtime_account(struct task_struct *tsk) { } #endif @@ -40,7 +40,7 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) static inline void vtime_account_irq_exit(struct task_struct *tsk) { /* On hard|softirq exit we always account to hard|softirq cputime */ - __vtime_account_system(tsk); + vtime_account_system(tsk); irqtime_account_irq(tsk); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8d859dae5bed..c0aa1ba752ea 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -433,20 +433,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_system_irqsafe(struct task_struct *tsk) { unsigned long flags; local_irq_save(flags); - __vtime_account_system(tsk); + vtime_account_system(tsk); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account_system); 
+EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * __vtime_account_system() and __vtime_account_idle(). Archs that + * vtime_account_system() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -459,9 +459,9 @@ void vtime_account(struct task_struct *tsk) local_irq_save(flags); if (in_interrupt() || !is_idle_task(tsk)) - __vtime_account_system(tsk); + vtime_account_system(tsk); else - __vtime_account_idle(tsk); + vtime_account_idle(tsk); local_irq_restore(flags); } From bcebdf846522056a84ba0b0cba5f5413868c9394 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Nov 2012 23:51:06 +0100 Subject: [PATCH 25/33] vtime: Explicitly account pending user time on process tick All vtime implementations just flush the user time on process tick. Consolidate that in generic code by calling a user time accounting helper. This avoids an indirect call in ia64 and prepares to also consolidate the vtime context switch code. 
Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/ia64/kernel/time.c | 11 +---------- arch/powerpc/kernel/time.c | 14 +++++++------- arch/s390/kernel/vtime.c | 7 ++++++- include/linux/kernel_stat.h | 8 ++++++++ include/linux/vtime.h | 1 + 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index f6388216080d..834c78bd3b5f 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -83,7 +83,7 @@ static struct clocksource *itc_clocksource; extern cputime_t cycle_to_cputime(u64 cyc); -static void vtime_account_user(struct task_struct *tsk) +void vtime_account_user(struct task_struct *tsk) { cputime_t delta_utime; struct thread_info *ti = task_thread_info(tsk); @@ -147,15 +147,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(vtime_delta(tsk)); } -/* - * Called from the timer interrupt handler to charge accumulated user time - * to the current process. Must be called with interrupts disabled. - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - vtime_account_user(p); -} - #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ static irqreturn_t diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index ce4cb772dc78..a667aaf85846 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -355,15 +355,15 @@ void vtime_account_idle(struct task_struct *tsk) } /* - * Transfer the user and system times accumulated in the paca - * by the exception entry and exit code to the generic process - * user and system time records. + * Transfer the user time accumulated in the paca + * by the exception entry and exit code to the generic + * process user time records. * Must be called with interrupts disabled. 
- * Assumes that vtime_account() has been called recently - * (i.e. since the last entry from usermode) so that + * Assumes that vtime_account_system/idle() has been called + * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ -void account_process_tick(struct task_struct *tsk, int user_tick) +void vtime_account_user(struct task_struct *tsk) { cputime_t utime, utimescaled; @@ -378,7 +378,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick) void vtime_task_switch(struct task_struct *prev) { vtime_account(prev); - account_process_tick(prev, 0); + vtime_account_user(prev); } #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 80d1dbc5d42e..7c6d861a1a40 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -112,7 +112,12 @@ void vtime_task_switch(struct task_struct *prev) S390_lowcore.system_timer = ti->system_timer; } -void account_process_tick(struct task_struct *tsk, int user_tick) +/* + * In s390, accounting pending user time also implies + * accounting system time in order to correctly compute + * the stolen time accounting. 
+ */ +void vtime_account_user(struct task_struct *tsk) { if (do_account_vtime(tsk, HARDIRQ_OFFSET)) virt_timer_expire(); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 1865b1f29770..66b70780e910 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -127,7 +127,15 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t) extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +static inline void account_process_tick(struct task_struct *tsk, int user) +{ + vtime_account_user(tsk); +} +#else extern void account_process_tick(struct task_struct *, int user); +#endif + extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 5ad13c325deb..ae30ab58431a 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -8,6 +8,7 @@ extern void vtime_task_switch(struct task_struct *prev); extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_system_irqsafe(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); extern void vtime_account(struct task_struct *tsk); #else static inline void vtime_task_switch(struct task_struct *prev) { } From e3942ba04052364d3c6454103362cafd87456010 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Nov 2012 00:24:25 +0100 Subject: [PATCH 26/33] vtime: Consolidate a bit the ctx switch code On ia64 and powerpc, vtime context switch only consists in flushing system and user pending time, plus a few arch housekeeping. Consolidate that into a generic implementation. s390 is a special case because pending user and system time accounting there is hard to dissociate. So it's keeping its own implementation. 
Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/ia64/include/asm/cputime.h | 2 ++ arch/ia64/kernel/time.c | 9 +-------- arch/powerpc/include/asm/cputime.h | 2 ++ arch/powerpc/kernel/time.c | 6 ------ arch/s390/include/asm/cputime.h | 1 + kernel/sched/cputime.c | 13 +++++++++++++ 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 3deac956d325..7fcf7f08ab06 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -103,5 +103,7 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) #define cputime64_to_clock_t(__ct) \ cputime_to_clock_t((__force cputime_t)__ct) +extern void arch_vtime_task_switch(struct task_struct *tsk); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __IA64_CPUTIME_H */ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 834c78bd3b5f..c9a7d2ebe089 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -100,18 +100,11 @@ void vtime_account_user(struct task_struct *tsk) * accumulated times to the current process, and to prepare accounting on * the next process. 
*/ -void vtime_task_switch(struct task_struct *prev) +void arch_vtime_task_switch(struct task_struct *prev) { struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); - if (idle_task(smp_processor_id()) != prev) - vtime_account_system(prev); - else - vtime_account_idle(prev); - - vtime_account_user(prev); - pi->ac_stamp = ni->ac_stamp; ni->ac_stime = ni->ac_utime = 0; } diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 487d46ff68a1..483733bd06d4 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -228,6 +228,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk) #define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) +static inline void arch_vtime_task_switch(struct task_struct *tsk) { } + #endif /* __KERNEL__ */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __POWERPC_CPUTIME_H */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index a667aaf85846..3486cfad4a63 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -375,12 +375,6 @@ void vtime_account_user(struct task_struct *tsk) account_user_time(tsk, utime, utimescaled); } -void vtime_task_switch(struct task_struct *prev) -{ - vtime_account(prev); - vtime_account_user(prev); -} - #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() #endif diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 023d5ae24482..d2ff41370c0c 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -14,6 +14,7 @@ #define __ARCH_HAS_VTIME_ACCOUNT +#define __ARCH_HAS_VTIME_TASK_SWITCH /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. 
*/ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c0aa1ba752ea..2e8d34aac97e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -443,6 +443,19 @@ void vtime_account_system_irqsafe(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement From 1017769bd0073f0a73e066377cd79a10cf0a33ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Nov 2012 00:26:54 +0100 Subject: [PATCH 27/33] vtime: No need to disable irqs on vtime_account() vtime_account() is only called from irq entry. irqs are always disabled at this point so we can safely remove the irq disabling guards on that function. 
Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- kernel/sched/cputime.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2e8d34aac97e..80b2fd5a7cf0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -467,16 +467,10 @@ void vtime_task_switch(struct task_struct *prev) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { - unsigned long flags; - - local_irq_save(flags); - if (in_interrupt() || !is_idle_task(tsk)) vtime_account_system(tsk); else vtime_account_idle(tsk); - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ From 1b2852b152be5150fbef7b585388ec43cf6f4415 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Nov 2012 17:00:24 +0100 Subject: [PATCH 28/33] vtime: Warn if irqs aren't disabled on system time accounting APIs System time accounting APIs such as vtime_account_system() and vtime_account_idle() need to be irqsafe. Current callers include irq entry, exit and kvm, all of which have been checked against that requirement. Now it's better to grow that with an automatic check in case we have further callers or we missed something. 
Suggested-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/ia64/kernel/time.c | 2 ++ arch/powerpc/kernel/time.c | 2 ++ arch/s390/kernel/vtime.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index c9a7d2ebe089..b1995efbfd21 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -119,6 +119,8 @@ static cputime_t vtime_delta(struct task_struct *tsk) cputime_t delta_stime; __u64 now; + WARN_ON_ONCE(!irqs_disabled()); + now = ia64_get_itc(); delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 3486cfad4a63..b3b14352b05e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -297,6 +297,8 @@ static u64 vtime_delta(struct task_struct *tsk, u64 now, nowscaled, deltascaled; u64 udelta, delta, user_scaled; + WARN_ON_ONCE(!irqs_disabled()); + now = mftb(); nowscaled = read_spurr(now); get_paca()->system_time += now - get_paca()->starttime; diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 7c6d861a1a40..e84b8b68444a 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -132,6 +132,8 @@ void vtime_account(struct task_struct *tsk) struct thread_info *ti = task_thread_info(tsk); u64 timer, system; + WARN_ON_ONCE(!irqs_disabled()); + timer = S390_lowcore.last_update_timer; S390_lowcore.last_update_timer = get_vtimer(); S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; From a634f93335daa8f38180a0e576ccd68a73c36eaf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 15:55:59 +0100 Subject: [PATCH 29/33] cputime: Move thread_group_cputime() to sched code thread_group_cputime() is a general cputime API 
that is not only used by posix cpu timer. Let's move this helper to sched code. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/posix-cpu-timers.c | 24 ------------------------ kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..d73840271dce 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { if (b->utime > a->utime) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8d859dae5bed..e56f138a23c7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) return false; } +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+ */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: [PATCH 30/33] cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. 
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- fs/proc/array.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 4 ++-- kernel/sched/cputime.c | 8 ++++---- kernel/sys.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..d3696708fc1a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } diff --git a/include/linux/sched.h b/include/linux/sched.h index e1581a029e3d..e75cab5820ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1751,8 +1751,8 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..618f7ee56003 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. 
* - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. */ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e56f138a23c7..7dc155371b95 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -445,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -516,7 +516,7 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -543,7 +543,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Must be called with siglock held. 
 */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct signal_struct *sig = p->signal; struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) cputime_t tgutime, tgstime, cutime, cstime; spin_lock_irq(&current->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(&current->sighand->siglock); @@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = 0; if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); + task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Nov 2012 00:58:35 +0100 Subject: [PATCH 31/33] cputime: Consolidate cputime adjustment code task_cputime_adjusted() and thread_group_cputime_adjusted() essentially share the same code. They just don't use the same source: * The first function uses the cputime in the task struct and the previous adjusted snapshot that ensures monotonicity. * The second adds the cputime of all tasks in the group and the previous adjusted snapshot of the whole group from the signal structure. Just consolidate the common code that does the adjustment. 
These functions just need to fetch the values from the appropriate source. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/sched.h | 23 +++++++++++++++++---- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 +++++++++++++++++++++--------------------- 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e75cab5820ab..5dafac366811 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -433,14 +433,29 @@ struct cpu_itimer { u32 incr_error; }; +/** + * struct cputime - snapshot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * + * Gathers a generic snapshot of user and system time. + */ +struct cputime { + cputime_t utime; + cputime_t stime; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This structure groups together three kinds of CPU time that are - * tracked for threads and thread groups. Most things considering + * This is an extension of struct cputime that includes the total runtime + * spent by the task from the scheduler point of view. + * + * As a result, this structure groups together three kinds of CPU time + * that are tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three * of them in parallel. 
*/ @@ -581,7 +596,7 @@ struct signal_struct { cputime_t gtime; cputime_t cgtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1340,7 +1355,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..0e7cdb90476f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7dc155371b95..220fdc4db770 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + utime = curr->utime; + total = utime + curr->stime; /* * Use CFS's precise accounting: */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -533,11 +537,22 @@ void task_cputime_adjusted(struct 
task_struct *p, cputime_t *ut, cputime_t *st) /* * Compare with previous values, to keep monotonicity: */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); - *ut = p->prev_utime; - *st = p->prev_stime; + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* @@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif From fa09205783d11cc05122ad6e4ce06074624b2c0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 Nov 2012 17:00:57 +0100 Subject: [PATCH 32/33] cputime: Comment cputime's adjusting code The reason for the scaling and monotonicity correction performed by cputime_adjust() may not be immediately clear to the reviewer. Add some comments to explain what happens there. 
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/sched/cputime.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 220fdc4db770..b7f731768625 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,6 +516,10 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) @@ -524,8 +528,16 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; total = utime + curr->stime; + /* - * Use CFS's precise accounting: + * Tick based cputime accounting depends on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); @@ -535,7 +547,9 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. 
*/ prev->utime = max(prev->utime, utime); prev->stime = max(prev->stime, rtime - prev->utime); From c1ad41f1f7270c1956da13fa8fd59d8d5929d56e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Dec 2012 10:23:45 +0100 Subject: [PATCH 33/33] Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled" This reverts commit 5258f386ea4e8454bc801fb443e8a4217da1947c, because the underlying autogroups bug got fixed upstream in a better way, via: fd8ef11730f1 Revert "sched, autogroup: Stop going ahead if autogroup is disabled" Cc: Mike Galbraith Cc: Yong Zhang Cc: Peter Zijlstra Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 +++++++++++++++++++++++++++++++++++++++ kernel/sched/auto_group.c | 68 ++++++++++++++++++++++++++++------ kernel/sched/auto_group.h | 9 ++++- kernel/sysctl.c | 6 ++- 4 files changed, 147 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 587631e1cd06..9e28356a959a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1272,6 +1272,81 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return 
-ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2582,6 +2657,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0f1bacb005a4..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -110,9 +110,6 @@ out_fail: bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { - if (!sysctl_sched_autogroup_enabled) - return false; - if (tg != &root_task_group) return false; @@ -146,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -158,11 +159,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) /* Allocates GFP_KERNEL, cannot be called under any 
spinlock */ void sched_autogroup_create_attach(struct task_struct *p) { - struct autogroup *ag; + struct autogroup *ag = autogroup_create(); - if (!sysctl_sched_autogroup_enabled) - return; - ag = autogroup_create(); autogroup_move_group(p, ag); /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); @@ -178,15 +176,11 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { - if (!sysctl_sched_autogroup_enabled) - return; sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) { - if (!sysctl_sched_autogroup_enabled) - return; autogroup_kref_put(sig->autogroup); } @@ -199,6 +193,58 @@ static int __init setup_autogroup(char *str) __setup("noautogroup", setup_autogroup); +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (nice < -20 || nice > 19) + return -EINVAL; + + err = security_task_setnice(current, nice); + if (err) + return err; + + if (nice < 0 && !can_nice(current, nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. 
*/ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + if (!err) + ag->nice = nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + if (!task_group_is_autogroup(ag->tg)) + goto out; + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + +out: + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + #ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 4552c6bf79d2..8bd047142816 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,6 +4,11 @@ #include struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. 
+ */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; @@ -24,7 +29,9 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - if (task_wants_autogroup(p, tg)) + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; return tg; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0fa5ad09873..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,8 +367,10 @@ static struct ctl_table kern_table[] = { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_CFS_BANDWIDTH