diff --git a/include/linux/sched.h b/include/linux/sched.h index 66169005f008..03c13b663e4b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -136,6 +136,7 @@ extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; +struct task_grp; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -1834,6 +1835,17 @@ extern int sched_mc_power_savings, sched_smt_power_savings; extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_grp init_task_grp; + +extern struct task_grp *sched_create_group(void); +extern void sched_destroy_group(struct task_grp *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/init/Kconfig b/init/Kconfig index 11c6762a6529..ef90a154dd90 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,13 +282,12 @@ config CPUSETS Say N if unsure. config FAIR_GROUP_SCHED - bool "Fair group scheduler" - depends on EXPERIMENTAL && CONTAINERS + bool "Fair group cpu scheduler" + default n + depends on EXPERIMENTAL help - This option enables you to group tasks and control CPU resource - allocation to such groups. - - Say N if unsure. + This feature lets cpu scheduler recognize task groups and control cpu + bandwidth allocation to such task groups. config SYSFS_DEPRECATED bool "Create deprecated sysfs files" diff --git a/kernel/sched.c b/kernel/sched.c index ee7ac71b12f8..e10c403b1213 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -173,13 +173,10 @@ struct rt_prio_array { #ifdef CONFIG_FAIR_GROUP_SCHED -#include - struct cfs_rq; /* task group related information */ struct task_grp { - struct container_subsys_state css; /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -192,22 +189,28 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. * Every task in system belong to this group at bootup. */ -static struct task_grp init_task_grp = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, - }; +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +#define INIT_TASK_GRP_LOAD NICE_0_LOAD +static int init_task_grp_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ static inline struct task_grp *task_grp(struct task_struct *p) { - return container_of(task_subsys_state(p, cpu_subsys_id), - struct task_grp, css); + struct task_grp *tg; + + tg = &init_task_grp; + + return tg; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -250,6 +253,7 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ struct task_grp *tg; /* group that "owns" this runqueue */ + struct rcu_head rcu; #endif }; @@ -6513,11 +6517,12 @@ void __init sched_init(void) init_sched_entity_p[i] = se; se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->load.weight = init_task_grp_load; + se->load.inv_weight = + div64_64(1ULL<<32, init_task_grp_load); se->parent = NULL; } - init_task_grp.shares = NICE_0_LOAD; + init_task_grp.shares = init_task_grp_load; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6707,45 +6712,28 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED -/* return corresponding task_grp object of a container */ -static inline struct task_grp *container_tg(struct container *cont) -{ - return container_of(container_subsys_state(cont, cpu_subsys_id), - struct task_grp, css); -} - /* allocate runqueue etc for a new task group */ -static struct container_subsys_state * -sched_create_group(struct container_subsys *ss, struct container *cont) +struct task_grp *sched_create_group(void) { struct task_grp *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rq *rq; int i; - if (!cont->parent) { - /* This is early initialization for the top container */ - init_task_grp.css.container = cont; - return &init_task_grp.css; - } - - /* we support only 1-level deep hierarchical scheduler atm */ - if (cont->parent->parent) - return ERR_PTR(-EINVAL); - tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); + rq = cpu_rq(i); cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -6763,7 +6751,6 @@ sched_create_group(struct container_subsys *ss, struct container *cont) tg->cfs_rq[i] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[i] = se; se->cfs_rq = &rq->cfs; @@ -6773,12 +6760,15 @@ sched_create_group(struct container_subsys *ss, struct container *cont) se->parent = NULL; } + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } + tg->shares = NICE_0_LOAD; - /* Bind the container to task_grp object we just created */ - tg->css.container = cont; - - return &tg->css; + return tg; err: for_each_possible_cpu(i) { @@ -6797,24 +6787,14 @@ err: return ERR_PTR(-ENOMEM); } - -/* destroy runqueue etc associated with a task group */ -static void sched_destroy_group(struct container_subsys *ss, - struct container *cont) +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) { - struct task_grp *tg = container_tg(cont); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); + struct task_grp *tg = cfs_rq->tg; struct sched_entity *se; int i; - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } - - /* wait for possible concurrent references to cfs_rqs complete */ - synchronize_sched(); - /* now it should be safe to free those cfs_rqs */ for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; @@ -6829,19 +6809,29 @@ static void sched_destroy_group(struct container_subsys *ss, kfree(tg); } -static int sched_can_attach(struct container_subsys *ss, - struct container *cont, struct task_struct *tsk) +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_grp *tg) { - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + struct cfs_rq *cfs_rq; + int i; - return 0; + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + cfs_rq = tg->cfs_rq[0]; + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&cfs_rq->rcu, free_sched_group); } -/* change task's runqueue when it moves between groups */ -static void sched_move_task(struct container_subsys *ss, struct container *cont, - struct container *old_cont, struct task_struct *tsk) +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) { int on_rq, running; unsigned long flags; @@ -6896,58 +6886,20 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) spin_unlock_irq(&rq->lock); } -static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +int sched_group_set_shares(struct task_grp *tg, unsigned long shares) { int i; - unsigned long shareval; - struct task_grp *tg = container_tg(cont); - char buffer[2*sizeof(unsigned long) + 1]; - if (nbytes > 2*sizeof(unsigned long)) /* safety check */ - return -E2BIG; + if (tg->shares == shares) + return 0; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + /* return -EINVAL if the new value is not sane */ - buffer[nbytes] = 0; /* nul-terminate */ - shareval = simple_strtoul(buffer, NULL, 10); - - tg->shares = shareval; + tg->shares = shares; for_each_possible_cpu(i) - set_se_shares(tg->se[i], shareval); + set_se_shares(tg->se[i], shares); - return nbytes; + return 0; } -static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) -{ - struct task_grp *tg = container_tg(cont); - - return (u64) tg->shares; -} - -struct cftype cpuctl_share = { - .name = "shares", - .read_uint = cpu_shares_read_uint, - .write = cpu_shares_write, -}; - -static int sched_populate(struct container_subsys *ss, struct container *cont) -{ - return container_add_file(cont, ss, &cpuctl_share); -} - -struct container_subsys cpu_subsys = { - .name = "cpu", - .create = sched_create_group, - .destroy = sched_destroy_group, - .can_attach = sched_can_attach, - .attach = sched_move_task, - .populate = sched_populate, - .subsys_id = cpu_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 556942cf2606..abd65ed9f2a5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -877,7 +877,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) if (!cfs_rq->nr_running) return MAX_PRIO; - curr = __pick_next_entity(cfs_rq); + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); return p->prio;