mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-16 08:44:21 +08:00
Merge branch 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (38 commits) sched debug: add name to sched_domain sysctl entries sched: sync wakeups vs avg_overlap sched: remove redundant code in cpu_cgroup_create() sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const sched: minor optimizations in wake_affine and select_task_rq_fair sched: maintain only task entities in cfs_rq->tasks list sched: fixup buddy selection sched: more sanity checks on the bandwidth settings sched: add some comments to the bandwidth code sched: fixlet for group load balance sched: rework wakeup preemption CFS scheduler: documentation about scheduling policies sched: clarify ifdef tangle sched: fix list traversal to use _rcu variant sched: turn off WAKEUP_OVERLAP sched: wakeup preempt when small overlap kernel/cpu.c: create a CPU_STARTING cpu_chain notifier kernel/cpu.c: Move the CPU_DYING notifiers sched: fix __load_balance_iterator() for cfq with only one task ...
This commit is contained in:
commit
b11ce8a26d
@ -168,10 +168,10 @@ if ($#ARGV < 0) {
|
|||||||
mkdir $ARGV[0],0777;
|
mkdir $ARGV[0],0777;
|
||||||
$state = 0;
|
$state = 0;
|
||||||
while (<STDIN>) {
|
while (<STDIN>) {
|
||||||
if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
|
if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
|
||||||
if ($state == 1) { close OUT }
|
if ($state == 1) { close OUT }
|
||||||
$state = 1;
|
$state = 1;
|
||||||
$fn = "$ARGV[0]/$1.4";
|
$fn = "$ARGV[0]/$1.9";
|
||||||
print STDERR "Creating $fn\n";
|
print STDERR "Creating $fn\n";
|
||||||
open OUT, ">$fn" or die "can't open $fn: $!\n";
|
open OUT, ">$fn" or die "can't open $fn: $!\n";
|
||||||
print OUT $_;
|
print OUT $_;
|
||||||
|
@ -1,151 +1,242 @@
|
|||||||
|
=============
|
||||||
This is the CFS scheduler.
|
CFS Scheduler
|
||||||
|
=============
|
||||||
80% of CFS's design can be summed up in a single sentence: CFS basically
|
|
||||||
models an "ideal, precise multi-tasking CPU" on real hardware.
|
|
||||||
|
|
||||||
"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
|
|
||||||
physical power and which can run each task at precise equal speed, in
|
|
||||||
parallel, each at 1/nr_running speed. For example: if there are 2 tasks
|
|
||||||
running then it runs each at 50% physical power - totally in parallel.
|
|
||||||
|
|
||||||
On real hardware, we can run only a single task at once, so while that
|
|
||||||
one task runs, the other tasks that are waiting for the CPU are at a
|
|
||||||
disadvantage - the current task gets an unfair amount of CPU time. In
|
|
||||||
CFS this fairness imbalance is expressed and tracked via the per-task
|
|
||||||
p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
|
|
||||||
time the task should now run on the CPU for it to become completely fair
|
|
||||||
and balanced.
|
|
||||||
|
|
||||||
( small detail: on 'ideal' hardware, the p->wait_runtime value would
|
|
||||||
always be zero - no task would ever get 'out of balance' from the
|
|
||||||
'ideal' share of CPU time. )
|
|
||||||
|
|
||||||
CFS's task picking logic is based on this p->wait_runtime value and it
|
|
||||||
is thus very simple: it always tries to run the task with the largest
|
|
||||||
p->wait_runtime value. In other words, CFS tries to run the task with
|
|
||||||
the 'gravest need' for more CPU time. So CFS always tries to split up
|
|
||||||
CPU time between runnable tasks as close to 'ideal multitasking
|
|
||||||
hardware' as possible.
|
|
||||||
|
|
||||||
Most of the rest of CFS's design just falls out of this really simple
|
|
||||||
concept, with a few add-on embellishments like nice levels,
|
|
||||||
multiprocessing and various algorithm variants to recognize sleepers.
|
|
||||||
|
|
||||||
In practice it works like this: the system runs a task a bit, and when
|
|
||||||
the task schedules (or a scheduler tick happens) the task's CPU usage is
|
|
||||||
'accounted for': the (small) time it just spent using the physical CPU
|
|
||||||
is deducted from p->wait_runtime. [minus the 'fair share' it would have
|
|
||||||
gotten anyway]. Once p->wait_runtime gets low enough so that another
|
|
||||||
task becomes the 'leftmost task' of the time-ordered rbtree it maintains
|
|
||||||
(plus a small amount of 'granularity' distance relative to the leftmost
|
|
||||||
task so that we do not over-schedule tasks and trash the cache) then the
|
|
||||||
new leftmost task is picked and the current task is preempted.
|
|
||||||
|
|
||||||
The rq->fair_clock value tracks the 'CPU time a runnable task would have
|
|
||||||
fairly gotten, had it been runnable during that time'. So by using
|
|
||||||
rq->fair_clock values we can accurately timestamp and measure the
|
|
||||||
'expected CPU time' a task should have gotten. All runnable tasks are
|
|
||||||
sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
|
|
||||||
CFS picks the 'leftmost' task and sticks to it. As the system progresses
|
|
||||||
forwards, newly woken tasks are put into the tree more and more to the
|
|
||||||
right - slowly but surely giving a chance for every task to become the
|
|
||||||
'leftmost task' and thus get on the CPU within a deterministic amount of
|
|
||||||
time.
|
|
||||||
|
|
||||||
Some implementation details:
|
|
||||||
|
|
||||||
- the introduction of Scheduling Classes: an extensible hierarchy of
|
|
||||||
scheduler modules. These modules encapsulate scheduling policy
|
|
||||||
details and are handled by the scheduler core without the core
|
|
||||||
code assuming about them too much.
|
|
||||||
|
|
||||||
- sched_fair.c implements the 'CFS desktop scheduler': it is a
|
|
||||||
replacement for the vanilla scheduler's SCHED_OTHER interactivity
|
|
||||||
code.
|
|
||||||
|
|
||||||
I'd like to give credit to Con Kolivas for the general approach here:
|
|
||||||
he has proven via RSDL/SD that 'fair scheduling' is possible and that
|
|
||||||
it results in better desktop scheduling. Kudos Con!
|
|
||||||
|
|
||||||
The CFS patch uses a completely different approach and implementation
|
|
||||||
from RSDL/SD. My goal was to make CFS's interactivity quality exceed
|
|
||||||
that of RSDL/SD, which is a high standard to meet :-) Testing
|
|
||||||
feedback is welcome to decide this one way or another. [ and, in any
|
|
||||||
case, all of SD's logic could be added via a kernel/sched_sd.c module
|
|
||||||
as well, if Con is interested in such an approach. ]
|
|
||||||
|
|
||||||
CFS's design is quite radical: it does not use runqueues, it uses a
|
|
||||||
time-ordered rbtree to build a 'timeline' of future task execution,
|
|
||||||
and thus has no 'array switch' artifacts (by which both the vanilla
|
|
||||||
scheduler and RSDL/SD are affected).
|
|
||||||
|
|
||||||
CFS uses nanosecond granularity accounting and does not rely on any
|
|
||||||
jiffies or other HZ detail. Thus the CFS scheduler has no notion of
|
|
||||||
'timeslices' and has no heuristics whatsoever. There is only one
|
|
||||||
central tunable (you have to switch on CONFIG_SCHED_DEBUG):
|
|
||||||
|
|
||||||
/proc/sys/kernel/sched_granularity_ns
|
|
||||||
|
|
||||||
which can be used to tune the scheduler from 'desktop' (low
|
|
||||||
latencies) to 'server' (good batching) workloads. It defaults to a
|
|
||||||
setting suitable for desktop workloads. SCHED_BATCH is handled by the
|
|
||||||
CFS scheduler module too.
|
|
||||||
|
|
||||||
Due to its design, the CFS scheduler is not prone to any of the
|
|
||||||
'attacks' that exist today against the heuristics of the stock
|
|
||||||
scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
|
|
||||||
work fine and do not impact interactivity and produce the expected
|
|
||||||
behavior.
|
|
||||||
|
|
||||||
the CFS scheduler has a much stronger handling of nice levels and
|
|
||||||
SCHED_BATCH: both types of workloads should be isolated much more
|
|
||||||
agressively than under the vanilla scheduler.
|
|
||||||
|
|
||||||
( another detail: due to nanosec accounting and timeline sorting,
|
|
||||||
sched_yield() support is very simple under CFS, and in fact under
|
|
||||||
CFS sched_yield() behaves much better than under any other
|
|
||||||
scheduler i have tested so far. )
|
|
||||||
|
|
||||||
- sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
|
|
||||||
way than the vanilla scheduler does. It uses 100 runqueues (for all
|
|
||||||
100 RT priority levels, instead of 140 in the vanilla scheduler)
|
|
||||||
and it needs no expired array.
|
|
||||||
|
|
||||||
- reworked/sanitized SMP load-balancing: the runqueue-walking
|
|
||||||
assumptions are gone from the load-balancing code now, and
|
|
||||||
iterators of the scheduling modules are used. The balancing code got
|
|
||||||
quite a bit simpler as a result.
|
|
||||||
|
|
||||||
|
|
||||||
Group scheduler extension to CFS
|
1. OVERVIEW
|
||||||
================================
|
|
||||||
|
|
||||||
Normally the scheduler operates on individual tasks and strives to provide
|
CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
|
||||||
fair CPU time to each task. Sometimes, it may be desirable to group tasks
|
scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
|
||||||
and provide fair CPU time to each such task group. For example, it may
|
replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
|
||||||
be desirable to first provide fair CPU time to each user on the system
|
code.
|
||||||
and then to each task belonging to a user.
|
|
||||||
|
|
||||||
CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
|
80% of CFS's design can be summed up in a single sentence: CFS basically models
|
||||||
SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
|
an "ideal, precise multi-tasking CPU" on real hardware.
|
||||||
groups. At present, there are two (mutually exclusive) mechanisms to group
|
|
||||||
tasks for CPU bandwidth control purpose:
|
|
||||||
|
|
||||||
- Based on user id (CONFIG_FAIR_USER_SCHED)
|
"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
|
||||||
In this option, tasks are grouped according to their user id.
|
power and which can run each task at precise equal speed, in parallel, each at
|
||||||
- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
|
1/nr_running speed. For example: if there are 2 tasks running, then it runs
|
||||||
This options lets the administrator create arbitrary groups
|
each at 50% physical power --- i.e., actually in parallel.
|
||||||
of tasks, using the "cgroup" pseudo filesystem. See
|
|
||||||
Documentation/cgroups.txt for more information about this
|
On real hardware, we can run only a single task at once, so we have to
|
||||||
filesystem.
|
introduce the concept of "virtual runtime." The virtual runtime of a task
|
||||||
|
specifies when its next timeslice would start execution on the ideal
|
||||||
|
multi-tasking CPU described above. In practice, the virtual runtime of a task
|
||||||
|
is its actual runtime normalized to the total number of running tasks.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
2. FEW IMPLEMENTATION DETAILS
|
||||||
|
|
||||||
|
In CFS the virtual runtime is expressed and tracked via the per-task
|
||||||
|
p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
|
||||||
|
timestamp and measure the "expected CPU time" a task should have gotten.
|
||||||
|
|
||||||
|
[ small detail: on "ideal" hardware, at any time all tasks would have the same
|
||||||
|
p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
|
||||||
|
would ever get "out of balance" from the "ideal" share of CPU time. ]
|
||||||
|
|
||||||
|
CFS's task picking logic is based on this p->se.vruntime value and it is thus
|
||||||
|
very simple: it always tries to run the task with the smallest p->se.vruntime
|
||||||
|
value (i.e., the task which executed least so far). CFS always tries to split
|
||||||
|
up CPU time between runnable tasks as close to "ideal multitasking hardware" as
|
||||||
|
possible.
|
||||||
|
|
||||||
|
Most of the rest of CFS's design just falls out of this really simple concept,
|
||||||
|
with a few add-on embellishments like nice levels, multiprocessing and various
|
||||||
|
algorithm variants to recognize sleepers.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
3. THE RBTREE
|
||||||
|
|
||||||
|
CFS's design is quite radical: it does not use the old data structures for the
|
||||||
|
runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
|
||||||
|
task execution, and thus has no "array switch" artifacts (by which both the
|
||||||
|
previous vanilla scheduler and RSDL/SD are affected).
|
||||||
|
|
||||||
|
CFS also maintains the rq->cfs.min_vruntime value, which is a monotonically
|
||||||
|
increasing value tracking the smallest vruntime among all tasks in the
|
||||||
|
runqueue. The total amount of work done by the system is tracked using
|
||||||
|
min_vruntime; that value is used to place newly activated entities on the left
|
||||||
|
side of the tree as much as possible.
|
||||||
|
|
||||||
|
The total number of running tasks in the runqueue is accounted through the
|
||||||
|
rq->cfs.load value, which is the sum of the weights of the tasks queued on the
|
||||||
|
runqueue.
|
||||||
|
|
||||||
|
CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
|
||||||
|
p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
|
||||||
|
account for possible wraparounds). CFS picks the "leftmost" task from this
|
||||||
|
tree and sticks to it.
|
||||||
|
As the system progresses forwards, the executed tasks are put into the tree
|
||||||
|
more and more to the right --- slowly but surely giving a chance for every task
|
||||||
|
to become the "leftmost task" and thus get on the CPU within a deterministic
|
||||||
|
amount of time.
|
||||||
|
|
||||||
|
Summing up, CFS works like this: it runs a task a bit, and when the task
|
||||||
|
schedules (or a scheduler tick happens) the task's CPU usage is "accounted
|
||||||
|
for": the (small) time it just spent using the physical CPU is added to
|
||||||
|
p->se.vruntime. Once p->se.vruntime gets high enough so that another task
|
||||||
|
becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
|
||||||
|
small amount of "granularity" distance relative to the leftmost task so that we
|
||||||
|
do not over-schedule tasks and thrash the cache), then the new leftmost task is
|
||||||
|
picked and the current task is preempted.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
4. SOME FEATURES OF CFS
|
||||||
|
|
||||||
|
CFS uses nanosecond granularity accounting and does not rely on any jiffies or
|
||||||
|
other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
|
||||||
|
way the previous scheduler had, and has no heuristics whatsoever. There is
|
||||||
|
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
|
||||||
|
|
||||||
|
/proc/sys/kernel/sched_granularity_ns
|
||||||
|
|
||||||
|
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
|
||||||
|
"server" (i.e., good batching) workloads. It defaults to a setting suitable
|
||||||
|
for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
|
||||||
|
|
||||||
|
Due to its design, the CFS scheduler is not prone to any of the "attacks" that
|
||||||
|
exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
|
||||||
|
chew.c, ring-test.c, massive_intr.c all work fine and do not impact
|
||||||
|
interactivity and produce the expected behavior.
|
||||||
|
|
||||||
|
The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
|
||||||
|
than the previous vanilla scheduler: both types of workloads are isolated much
|
||||||
|
more aggressively.
|
||||||
|
|
||||||
|
SMP load-balancing has been reworked/sanitized: the runqueue-walking
|
||||||
|
assumptions are gone from the load-balancing code now, and iterators of the
|
||||||
|
scheduling modules are used. The balancing code got quite a bit simpler as a
|
||||||
|
result.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
5. Scheduling policies
|
||||||
|
|
||||||
|
CFS implements three scheduling policies:
|
||||||
|
|
||||||
|
- SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
|
||||||
|
policy that is used for regular tasks.
|
||||||
|
|
||||||
|
- SCHED_BATCH: Does not preempt nearly as often as regular tasks
|
||||||
|
would, thereby allowing tasks to run longer and make better use of
|
||||||
|
caches but at the cost of interactivity. This is well suited for
|
||||||
|
batch jobs.
|
||||||
|
|
||||||
|
- SCHED_IDLE: This is even weaker than nice 19, but it's not a true
|
||||||
|
idle timer scheduler in order to avoid getting into priority
|
||||||
|
inversion problems which would deadlock the machine.
|
||||||
|
|
||||||
|
SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
|
||||||
|
POSIX.
|
||||||
|
|
||||||
|
The command chrt from util-linux-ng 2.13.1.1 can set all of these except
|
||||||
|
SCHED_IDLE.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
6. SCHEDULING CLASSES
|
||||||
|
|
||||||
|
The new CFS scheduler has been designed in such a way to introduce "Scheduling
|
||||||
|
Classes," an extensible hierarchy of scheduler modules. These modules
|
||||||
|
encapsulate scheduling policy details and are handled by the scheduler core
|
||||||
|
without the core code assuming too much about them.
|
||||||
|
|
||||||
|
sched_fair.c implements the CFS scheduler described above.
|
||||||
|
|
||||||
|
sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
|
||||||
|
the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
|
||||||
|
priority levels, instead of 140 in the previous scheduler) and it needs no
|
||||||
|
expired array.
|
||||||
|
|
||||||
|
Scheduling classes are implemented through the sched_class structure, which
|
||||||
|
contains hooks to functions that must be called whenever an interesting event
|
||||||
|
occurs.
|
||||||
|
|
||||||
|
This is the (partial) list of the hooks:
|
||||||
|
|
||||||
|
- enqueue_task(...)
|
||||||
|
|
||||||
|
Called when a task enters a runnable state.
|
||||||
|
It puts the scheduling entity (task) into the red-black tree and
|
||||||
|
increments the nr_running variable.
|
||||||
|
|
||||||
|
- dequeue_task(...)
|
||||||
|
|
||||||
|
When a task is no longer runnable, this function is called to keep the
|
||||||
|
corresponding scheduling entity out of the red-black tree. It decrements
|
||||||
|
the nr_running variable.
|
||||||
|
|
||||||
|
- yield_task(...)
|
||||||
|
|
||||||
|
This function is basically just a dequeue followed by an enqueue, unless the
|
||||||
|
compat_yield sysctl is turned on; in that case, it places the scheduling
|
||||||
|
entity at the right-most end of the red-black tree.
|
||||||
|
|
||||||
|
- check_preempt_curr(...)
|
||||||
|
|
||||||
|
This function checks if a task that entered the runnable state should
|
||||||
|
preempt the currently running task.
|
||||||
|
|
||||||
|
- pick_next_task(...)
|
||||||
|
|
||||||
|
This function chooses the most appropriate task eligible to run next.
|
||||||
|
|
||||||
|
- set_curr_task(...)
|
||||||
|
|
||||||
|
This function is called when a task changes its scheduling class or changes
|
||||||
|
its task group.
|
||||||
|
|
||||||
|
- task_tick(...)
|
||||||
|
|
||||||
|
This function is mostly called from time tick functions; it might lead to
|
||||||
|
a process switch. This drives the running preemption.
|
||||||
|
|
||||||
|
- task_new(...)
|
||||||
|
|
||||||
|
The core scheduler gives the scheduling module an opportunity to manage new
|
||||||
|
task startup. The CFS scheduling module uses it for group scheduling, while
|
||||||
|
the scheduling module for a real-time task does not use it.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
7. GROUP SCHEDULER EXTENSIONS TO CFS
|
||||||
|
|
||||||
|
Normally, the scheduler operates on individual tasks and strives to provide
|
||||||
|
fair CPU time to each task. Sometimes, it may be desirable to group tasks and
|
||||||
|
provide fair CPU time to each such task group. For example, it may be
|
||||||
|
desirable to first provide fair CPU time to each user on the system and then to
|
||||||
|
each task belonging to a user.
|
||||||
|
|
||||||
|
CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks be
|
||||||
|
grouped and divides CPU time fairly among such groups.
|
||||||
|
|
||||||
|
CONFIG_RT_GROUP_SCHED permits grouping real-time (i.e., SCHED_FIFO and
|
||||||
|
SCHED_RR) tasks.
|
||||||
|
|
||||||
|
CONFIG_FAIR_GROUP_SCHED permits grouping CFS (i.e., SCHED_NORMAL and
|
||||||
|
SCHED_BATCH) tasks.
|
||||||
|
|
||||||
|
At present, there are two (mutually exclusive) mechanisms to group tasks for
|
||||||
|
CPU bandwidth control purposes:
|
||||||
|
|
||||||
|
- Based on user id (CONFIG_USER_SCHED)
|
||||||
|
|
||||||
|
With this option, tasks are grouped according to their user id.
|
||||||
|
|
||||||
|
- Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
|
||||||
|
|
||||||
|
This option needs CONFIG_CGROUPS to be defined, and lets the administrator
|
||||||
|
create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
|
||||||
|
Documentation/cgroups.txt for more information about this filesystem.
|
||||||
|
|
||||||
Only one of these options to group tasks can be chosen and not both.
|
Only one of these options to group tasks can be chosen and not both.
|
||||||
|
|
||||||
Group scheduler tunables:
|
When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
|
||||||
|
user and a "cpu_share" file is added in that directory.
|
||||||
When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
|
|
||||||
each new user and a "cpu_share" file is added in that directory.
|
|
||||||
|
|
||||||
# cd /sys/kernel/uids
|
# cd /sys/kernel/uids
|
||||||
# cat 512/cpu_share # Display user 512's CPU share
|
# cat 512/cpu_share # Display user 512's CPU share
|
||||||
@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory.
|
|||||||
2048
|
2048
|
||||||
#
|
#
|
||||||
|
|
||||||
CPU bandwidth between two users are divided in the ratio of their CPU shares.
|
CPU bandwidth between two users is divided in the ratio of their CPU shares.
|
||||||
For ex: if you would like user "root" to get twice the bandwidth of user
|
For example: if you would like user "root" to get twice the bandwidth of user
|
||||||
"guest", then set the cpu_share for both the users such that "root"'s
|
"guest," then set the cpu_share for both the users such that "root"'s cpu_share
|
||||||
cpu_share is twice "guest"'s cpu_share
|
is twice "guest"'s cpu_share.
|
||||||
|
|
||||||
|
When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
|
||||||
When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
|
group created using the pseudo filesystem. See example steps below to create
|
||||||
for each group created using the pseudo filesystem. See example steps
|
task groups and modify their CPU share using the "cgroups" pseudo filesystem.
|
||||||
below to create task groups and modify their CPU share using the "cgroups"
|
|
||||||
pseudo filesystem
|
|
||||||
|
|
||||||
# mkdir /dev/cpuctl
|
# mkdir /dev/cpuctl
|
||||||
# mount -t cgroup -ocpu none /dev/cpuctl
|
# mount -t cgroup -ocpu none /dev/cpuctl
|
||||||
|
@ -149,6 +149,9 @@ smp_callin(void)
|
|||||||
atomic_inc(&init_mm.mm_count);
|
atomic_inc(&init_mm.mm_count);
|
||||||
current->active_mm = &init_mm;
|
current->active_mm = &init_mm;
|
||||||
|
|
||||||
|
/* inform the notifiers about the new cpu */
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
|
|
||||||
/* Must have completely accurate bogos. */
|
/* Must have completely accurate bogos. */
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
|
|
||||||
|
@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
|
|||||||
/*
|
/*
|
||||||
* Enable local interrupts.
|
* Enable local interrupts.
|
||||||
*/
|
*/
|
||||||
|
notify_cpu_starting(cpu);
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
local_fiq_enable();
|
local_fiq_enable();
|
||||||
|
|
||||||
|
@ -178,6 +178,7 @@ void __init smp_callin(void)
|
|||||||
unmask_irq(IPI_INTR_VECT);
|
unmask_irq(IPI_INTR_VECT);
|
||||||
unmask_irq(TIMER0_INTR_VECT);
|
unmask_irq(TIMER0_INTR_VECT);
|
||||||
preempt_disable();
|
preempt_disable();
|
||||||
|
notify_cpu_starting(cpu);
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
|
|
||||||
cpu_set(cpu, cpu_online_map);
|
cpu_set(cpu, cpu_online_map);
|
||||||
|
@ -401,6 +401,7 @@ smp_callin (void)
|
|||||||
spin_lock(&vector_lock);
|
spin_lock(&vector_lock);
|
||||||
/* Setup the per cpu irq handling data structures */
|
/* Setup the per cpu irq handling data structures */
|
||||||
__setup_vector_irq(cpuid);
|
__setup_vector_irq(cpuid);
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
cpu_set(cpuid, cpu_online_map);
|
cpu_set(cpuid, cpu_online_map);
|
||||||
per_cpu(cpu_state, cpuid) = CPU_ONLINE;
|
per_cpu(cpu_state, cpuid) = CPU_ONLINE;
|
||||||
spin_unlock(&vector_lock);
|
spin_unlock(&vector_lock);
|
||||||
|
@ -498,6 +498,8 @@ static void __init smp_online(void)
|
|||||||
{
|
{
|
||||||
int cpu_id = smp_processor_id();
|
int cpu_id = smp_processor_id();
|
||||||
|
|
||||||
|
notify_cpu_starting(cpu_id);
|
||||||
|
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
|
|
||||||
/* Get our bogomips. */
|
/* Get our bogomips. */
|
||||||
|
@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void)
|
|||||||
cpu = smp_processor_id();
|
cpu = smp_processor_id();
|
||||||
cpu_data[cpu].udelay_val = loops_per_jiffy;
|
cpu_data[cpu].udelay_val = loops_per_jiffy;
|
||||||
|
|
||||||
|
notify_cpu_starting(cpu);
|
||||||
|
|
||||||
mp_ops->smp_finish();
|
mp_ops->smp_finish();
|
||||||
set_cpu_sibling_map(cpu);
|
set_cpu_sibling_map(cpu);
|
||||||
|
|
||||||
|
@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused)
|
|||||||
secondary_cpu_time_init();
|
secondary_cpu_time_init();
|
||||||
|
|
||||||
ipi_call_lock();
|
ipi_call_lock();
|
||||||
|
notify_cpu_starting(cpu);
|
||||||
cpu_set(cpu, cpu_online_map);
|
cpu_set(cpu, cpu_online_map);
|
||||||
/* Update sibling maps */
|
/* Update sibling maps */
|
||||||
base = cpu_first_thread_in_core(cpu);
|
base = cpu_first_thread_in_core(cpu);
|
||||||
|
@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid)
|
|||||||
/* Enable pfault pseudo page faults on this cpu. */
|
/* Enable pfault pseudo page faults on this cpu. */
|
||||||
pfault_init();
|
pfault_init();
|
||||||
|
|
||||||
|
/* call cpu notifiers */
|
||||||
|
notify_cpu_starting(smp_processor_id());
|
||||||
/* Mark this cpu as online */
|
/* Mark this cpu as online */
|
||||||
spin_lock(&call_lock);
|
spin_lock(&call_lock);
|
||||||
cpu_set(smp_processor_id(), cpu_online_map);
|
cpu_set(smp_processor_id(), cpu_online_map);
|
||||||
|
@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void)
|
|||||||
|
|
||||||
preempt_disable();
|
preempt_disable();
|
||||||
|
|
||||||
|
notify_cpu_starting(smp_processor_id());
|
||||||
|
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
|
|
||||||
calibrate_delay();
|
calibrate_delay();
|
||||||
|
@ -88,6 +88,7 @@ void __init smp4d_callin(void)
|
|||||||
local_flush_cache_all();
|
local_flush_cache_all();
|
||||||
local_flush_tlb_all();
|
local_flush_tlb_all();
|
||||||
|
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
/*
|
/*
|
||||||
* Unblock the master CPU _only_ when the scheduler state
|
* Unblock the master CPU _only_ when the scheduler state
|
||||||
* of all secondary CPUs will be up-to-date, so after
|
* of all secondary CPUs will be up-to-date, so after
|
||||||
|
@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void)
|
|||||||
local_flush_cache_all();
|
local_flush_cache_all();
|
||||||
local_flush_tlb_all();
|
local_flush_tlb_all();
|
||||||
|
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
|
|
||||||
/* Get our local ticker going. */
|
/* Get our local ticker going. */
|
||||||
smp_setup_percpu_timer();
|
smp_setup_percpu_timer();
|
||||||
|
|
||||||
|
@ -85,6 +85,7 @@ static int idle_proc(void *cpup)
|
|||||||
while (!cpu_isset(cpu, smp_commenced_mask))
|
while (!cpu_isset(cpu, smp_commenced_mask))
|
||||||
cpu_relax();
|
cpu_relax();
|
||||||
|
|
||||||
|
notify_cpu_starting(cpu);
|
||||||
cpu_set(cpu, cpu_online_map);
|
cpu_set(cpu, cpu_online_map);
|
||||||
default_idle();
|
default_idle();
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void)
|
|||||||
end_local_APIC_setup();
|
end_local_APIC_setup();
|
||||||
map_cpu_to_logical_apicid();
|
map_cpu_to_logical_apicid();
|
||||||
|
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
/*
|
/*
|
||||||
* Get our bogomips.
|
* Get our bogomips.
|
||||||
*
|
*
|
||||||
|
@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
|
|||||||
|
|
||||||
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
|
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
|
||||||
|
|
||||||
|
notify_cpu_starting(cpuid);
|
||||||
|
|
||||||
/* enable interrupts */
|
/* enable interrupts */
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
|
|
||||||
|
@ -10,6 +10,18 @@
|
|||||||
|
|
||||||
#include <linux/wait.h>
|
#include <linux/wait.h>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* struct completion - structure used to maintain state for a "completion"
|
||||||
|
*
|
||||||
|
* This is the opaque structure used to maintain the state for a "completion".
|
||||||
|
* Completions currently use a FIFO to queue threads that have to wait for
|
||||||
|
* the "completion" event.
|
||||||
|
*
|
||||||
|
* See also: complete(), wait_for_completion() (and friends _timeout,
|
||||||
|
* _interruptible, _interruptible_timeout, and _killable), init_completion(),
|
||||||
|
* and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
|
||||||
|
* INIT_COMPLETION().
|
||||||
|
*/
|
||||||
struct completion {
|
struct completion {
|
||||||
unsigned int done;
|
unsigned int done;
|
||||||
wait_queue_head_t wait;
|
wait_queue_head_t wait;
|
||||||
@ -21,6 +33,14 @@ struct completion {
|
|||||||
#define COMPLETION_INITIALIZER_ONSTACK(work) \
|
#define COMPLETION_INITIALIZER_ONSTACK(work) \
|
||||||
({ init_completion(&work); work; })
|
({ init_completion(&work); work; })
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DECLARE_COMPLETION: - declare and initialize a completion structure
|
||||||
|
* @work: identifier for the completion structure
|
||||||
|
*
|
||||||
|
* This macro declares and initializes a completion structure. Generally used
|
||||||
|
* for static declarations. You should use the _ONSTACK variant for automatic
|
||||||
|
* variables.
|
||||||
|
*/
|
||||||
#define DECLARE_COMPLETION(work) \
|
#define DECLARE_COMPLETION(work) \
|
||||||
struct completion work = COMPLETION_INITIALIZER(work)
|
struct completion work = COMPLETION_INITIALIZER(work)
|
||||||
|
|
||||||
@ -29,6 +49,13 @@ struct completion {
|
|||||||
* completions - so we use the _ONSTACK() variant for those that
|
* completions - so we use the _ONSTACK() variant for those that
|
||||||
* are on the kernel stack:
|
* are on the kernel stack:
|
||||||
*/
|
*/
|
||||||
|
/**
|
||||||
|
* DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure
|
||||||
|
* @work: identifier for the completion structure
|
||||||
|
*
|
||||||
|
* This macro declares and initializes a completion structure on the kernel
|
||||||
|
* stack.
|
||||||
|
*/
|
||||||
#ifdef CONFIG_LOCKDEP
|
#ifdef CONFIG_LOCKDEP
|
||||||
# define DECLARE_COMPLETION_ONSTACK(work) \
|
# define DECLARE_COMPLETION_ONSTACK(work) \
|
||||||
struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
|
struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
|
||||||
@ -36,6 +63,13 @@ struct completion {
|
|||||||
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
|
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* init_completion: - Initialize a dynamically allocated completion
|
||||||
|
* @x: completion structure that is to be initialized
|
||||||
|
*
|
||||||
|
* This inline function will initialize a dynamically created completion
|
||||||
|
* structure.
|
||||||
|
*/
|
||||||
static inline void init_completion(struct completion *x)
|
static inline void init_completion(struct completion *x)
|
||||||
{
|
{
|
||||||
x->done = 0;
|
x->done = 0;
|
||||||
@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x);
|
|||||||
extern void complete(struct completion *);
|
extern void complete(struct completion *);
|
||||||
extern void complete_all(struct completion *);
|
extern void complete_all(struct completion *);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* INIT_COMPLETION: - reinitialize a completion structure
|
||||||
|
* @x: completion structure to be reinitialized
|
||||||
|
*
|
||||||
|
* This macro should be used to reinitialize a completion structure so it can
|
||||||
|
* be reused. This is especially important after complete_all() is used.
|
||||||
|
*/
|
||||||
#define INIT_COMPLETION(x) ((x).done = 0)
|
#define INIT_COMPLETION(x) ((x).done = 0)
|
||||||
|
|
||||||
|
|
||||||
|
@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
int cpu_up(unsigned int cpu);
|
int cpu_up(unsigned int cpu);
|
||||||
|
void notify_cpu_starting(unsigned int cpu);
|
||||||
extern void cpu_hotplug_init(void);
|
extern void cpu_hotplug_init(void);
|
||||||
extern void cpu_maps_update_begin(void);
|
extern void cpu_maps_update_begin(void);
|
||||||
extern void cpu_maps_update_done(void);
|
extern void cpu_maps_update_done(void);
|
||||||
|
@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
|
|||||||
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
|
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
|
||||||
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
|
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
|
||||||
#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
|
#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
|
||||||
* not handling interrupts, soon dead */
|
* not handling interrupts, soon dead.
|
||||||
|
* Called on the dying cpu, interrupts
|
||||||
|
* are already disabled. Must not
|
||||||
|
* sleep, must not fail */
|
||||||
#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug
|
#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug
|
||||||
* lock is dropped */
|
* lock is dropped */
|
||||||
|
#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running.
|
||||||
|
* Called on the new cpu, just before
|
||||||
|
* enabling interrupts. Must not sleep,
|
||||||
|
* must not fail */
|
||||||
|
|
||||||
/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
|
/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
|
||||||
* operation in progress
|
* operation in progress
|
||||||
@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret)
|
|||||||
#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
|
#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
|
||||||
#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
|
#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
|
||||||
#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
|
#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
|
||||||
|
#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN)
|
||||||
|
|
||||||
/* Hibernation and suspend events */
|
/* Hibernation and suspend events */
|
||||||
#define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */
|
#define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */
|
||||||
|
@ -104,8 +104,8 @@ struct prop_local_single {
|
|||||||
* snapshot of the last seen global state
|
* snapshot of the last seen global state
|
||||||
* and a lock protecting this state
|
* and a lock protecting this state
|
||||||
*/
|
*/
|
||||||
int shift;
|
|
||||||
unsigned long period;
|
unsigned long period;
|
||||||
|
int shift;
|
||||||
spinlock_t lock; /* protect the snapshot state */
|
spinlock_t lock; /* protect the snapshot state */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -451,8 +451,8 @@ struct signal_struct {
|
|||||||
* - everyone except group_exit_task is stopped during signal delivery
|
* - everyone except group_exit_task is stopped during signal delivery
|
||||||
* of fatal signals, group_exit_task processes the signal.
|
* of fatal signals, group_exit_task processes the signal.
|
||||||
*/
|
*/
|
||||||
struct task_struct *group_exit_task;
|
|
||||||
int notify_count;
|
int notify_count;
|
||||||
|
struct task_struct *group_exit_task;
|
||||||
|
|
||||||
/* thread group stop support, overloads group_exit_code too */
|
/* thread group stop support, overloads group_exit_code too */
|
||||||
int group_stop_count;
|
int group_stop_count;
|
||||||
@ -824,6 +824,9 @@ struct sched_domain {
|
|||||||
unsigned int ttwu_move_affine;
|
unsigned int ttwu_move_affine;
|
||||||
unsigned int ttwu_move_balance;
|
unsigned int ttwu_move_balance;
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CONFIG_SCHED_DEBUG
|
||||||
|
char *name;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
|
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
|
||||||
@ -897,7 +900,7 @@ struct sched_class {
|
|||||||
void (*yield_task) (struct rq *rq);
|
void (*yield_task) (struct rq *rq);
|
||||||
int (*select_task_rq)(struct task_struct *p, int sync);
|
int (*select_task_rq)(struct task_struct *p, int sync);
|
||||||
|
|
||||||
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
|
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
|
||||||
|
|
||||||
struct task_struct * (*pick_next_task) (struct rq *rq);
|
struct task_struct * (*pick_next_task) (struct rq *rq);
|
||||||
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
|
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
|
||||||
@ -1010,8 +1013,8 @@ struct sched_entity {
|
|||||||
|
|
||||||
struct sched_rt_entity {
|
struct sched_rt_entity {
|
||||||
struct list_head run_list;
|
struct list_head run_list;
|
||||||
unsigned int time_slice;
|
|
||||||
unsigned long timeout;
|
unsigned long timeout;
|
||||||
|
unsigned int time_slice;
|
||||||
int nr_cpus_allowed;
|
int nr_cpus_allowed;
|
||||||
|
|
||||||
struct sched_rt_entity *back;
|
struct sched_rt_entity *back;
|
||||||
|
24
kernel/cpu.c
24
kernel/cpu.c
@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
|
|||||||
struct take_cpu_down_param *param = _param;
|
struct take_cpu_down_param *param = _param;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
|
|
||||||
param->hcpu);
|
|
||||||
/* Ensure this CPU doesn't handle any more interrupts. */
|
/* Ensure this CPU doesn't handle any more interrupts. */
|
||||||
err = __cpu_disable();
|
err = __cpu_disable();
|
||||||
if (err < 0)
|
if (err < 0)
|
||||||
return err;
|
return err;
|
||||||
|
|
||||||
|
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
|
||||||
|
param->hcpu);
|
||||||
|
|
||||||
/* Force idle task to run as soon as we yield: it should
|
/* Force idle task to run as soon as we yield: it should
|
||||||
immediately notice cpu is offline and die quickly. */
|
immediately notice cpu is offline and die quickly. */
|
||||||
sched_idle_next();
|
sched_idle_next();
|
||||||
@ -453,6 +454,25 @@ out:
|
|||||||
}
|
}
|
||||||
#endif /* CONFIG_PM_SLEEP_SMP */
|
#endif /* CONFIG_PM_SLEEP_SMP */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
|
||||||
|
* @cpu: cpu that just started
|
||||||
|
*
|
||||||
|
* This function calls the cpu_chain notifiers with CPU_STARTING.
|
||||||
|
* It must be called by the arch code on the new cpu, before the new cpu
|
||||||
|
* enables interrupts and before the "boot" cpu returns from __cpu_up().
|
||||||
|
*/
|
||||||
|
void notify_cpu_starting(unsigned int cpu)
|
||||||
|
{
|
||||||
|
unsigned long val = CPU_STARTING;
|
||||||
|
|
||||||
|
#ifdef CONFIG_PM_SLEEP_SMP
|
||||||
|
if (cpu_isset(cpu, frozen_cpus))
|
||||||
|
val = CPU_STARTING_FROZEN;
|
||||||
|
#endif /* CONFIG_PM_SLEEP_SMP */
|
||||||
|
raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* CONFIG_SMP */
|
#endif /* CONFIG_SMP */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
|
|||||||
* that has tasks along with an empty 'mems'. But if we did see such
|
* that has tasks along with an empty 'mems'. But if we did see such
|
||||||
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
|
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
|
||||||
*/
|
*/
|
||||||
static void scan_for_empty_cpusets(const struct cpuset *root)
|
static void scan_for_empty_cpusets(struct cpuset *root)
|
||||||
{
|
{
|
||||||
LIST_HEAD(queue);
|
LIST_HEAD(queue);
|
||||||
struct cpuset *cp; /* scans cpusets being updated */
|
struct cpuset *cp; /* scans cpusets being updated */
|
||||||
|
407
kernel/sched.c
407
kernel/sched.c
@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
|
|||||||
rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
|
rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int rt_bandwidth_enabled(void)
|
||||||
|
{
|
||||||
|
return sysctl_sched_rt_runtime >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
||||||
{
|
{
|
||||||
ktime_t now;
|
ktime_t now;
|
||||||
|
|
||||||
if (rt_b->rt_runtime == RUNTIME_INF)
|
if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (hrtimer_active(&rt_b->rt_period_timer))
|
if (hrtimer_active(&rt_b->rt_period_timer))
|
||||||
@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
|
|||||||
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
|
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
|
||||||
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
|
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
|
||||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||||
#else /* !CONFIG_FAIR_GROUP_SCHED */
|
#else /* !CONFIG_USER_SCHED */
|
||||||
#define root_task_group init_task_group
|
#define root_task_group init_task_group
|
||||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
#endif /* CONFIG_USER_SCHED */
|
||||||
|
|
||||||
/* task_group_lock serializes add/remove of task groups and also changes to
|
/* task_group_lock serializes add/remove of task groups and also changes to
|
||||||
* a task group's cpu shares.
|
* a task group's cpu shares.
|
||||||
@ -604,9 +609,9 @@ struct rq {
|
|||||||
|
|
||||||
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
||||||
|
|
||||||
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
|
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
|
||||||
{
|
{
|
||||||
rq->curr->sched_class->check_preempt_curr(rq, p);
|
rq->curr->sched_class->check_preempt_curr(rq, p, sync);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int cpu_of(struct rq *rq)
|
static inline int cpu_of(struct rq *rq)
|
||||||
@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
|
|||||||
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
|
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_hrtick(void)
|
static inline void init_hrtick(void)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_SMP */
|
#endif /* CONFIG_SMP */
|
||||||
@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq)
|
|||||||
rq->hrtick_timer.function = hrtick;
|
rq->hrtick_timer.function = hrtick;
|
||||||
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
|
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
|
||||||
}
|
}
|
||||||
#else
|
#else /* CONFIG_SCHED_HRTICK */
|
||||||
static inline void hrtick_clear(struct rq *rq)
|
static inline void hrtick_clear(struct rq *rq)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
|
|||||||
static inline void init_hrtick(void)
|
static inline void init_hrtick(void)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
#endif
|
#endif /* CONFIG_SCHED_HRTICK */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* resched_task - mark a task 'to be rescheduled now'.
|
* resched_task - mark a task 'to be rescheduled now'.
|
||||||
@ -1380,6 +1385,51 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
|
|||||||
update_load_sub(&rq->load, load);
|
update_load_sub(&rq->load, load);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
|
||||||
|
typedef int (*tg_visitor)(struct task_group *, void *);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iterate the full tree, calling @down when first entering a node and @up when
|
||||||
|
* leaving it for the final time.
|
||||||
|
*/
|
||||||
|
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
|
||||||
|
{
|
||||||
|
struct task_group *parent, *child;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
parent = &root_task_group;
|
||||||
|
down:
|
||||||
|
ret = (*down)(parent, data);
|
||||||
|
if (ret)
|
||||||
|
goto out_unlock;
|
||||||
|
list_for_each_entry_rcu(child, &parent->children, siblings) {
|
||||||
|
parent = child;
|
||||||
|
goto down;
|
||||||
|
|
||||||
|
up:
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ret = (*up)(parent, data);
|
||||||
|
if (ret)
|
||||||
|
goto out_unlock;
|
||||||
|
|
||||||
|
child = parent;
|
||||||
|
parent = parent->parent;
|
||||||
|
if (parent)
|
||||||
|
goto up;
|
||||||
|
out_unlock:
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int tg_nop(struct task_group *tg, void *data)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
static unsigned long source_load(int cpu, int type);
|
static unsigned long source_load(int cpu, int type);
|
||||||
static unsigned long target_load(int cpu, int type);
|
static unsigned long target_load(int cpu, int type);
|
||||||
@ -1397,37 +1447,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
|
|||||||
|
|
||||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||||
|
|
||||||
typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Iterate the full tree, calling @down when first entering a node and @up when
|
|
||||||
* leaving it for the final time.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
|
|
||||||
{
|
|
||||||
struct task_group *parent, *child;
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
parent = &root_task_group;
|
|
||||||
down:
|
|
||||||
(*down)(parent, cpu, sd);
|
|
||||||
list_for_each_entry_rcu(child, &parent->children, siblings) {
|
|
||||||
parent = child;
|
|
||||||
goto down;
|
|
||||||
|
|
||||||
up:
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
(*up)(parent, cpu, sd);
|
|
||||||
|
|
||||||
child = parent;
|
|
||||||
parent = parent->parent;
|
|
||||||
if (parent)
|
|
||||||
goto up;
|
|
||||||
rcu_read_unlock();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
|
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
|
|||||||
* This needs to be done in a bottom-up fashion because the rq weight of a
|
* This needs to be done in a bottom-up fashion because the rq weight of a
|
||||||
* parent group depends on the shares of its child groups.
|
* parent group depends on the shares of its child groups.
|
||||||
*/
|
*/
|
||||||
static void
|
static int tg_shares_up(struct task_group *tg, void *data)
|
||||||
tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|
||||||
{
|
{
|
||||||
unsigned long rq_weight = 0;
|
unsigned long rq_weight = 0;
|
||||||
unsigned long shares = 0;
|
unsigned long shares = 0;
|
||||||
|
struct sched_domain *sd = data;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for_each_cpu_mask(i, sd->span) {
|
for_each_cpu_mask(i, sd->span) {
|
||||||
@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|||||||
__update_group_shares_cpu(tg, i, shares, rq_weight);
|
__update_group_shares_cpu(tg, i, shares, rq_weight);
|
||||||
spin_unlock_irqrestore(&rq->lock, flags);
|
spin_unlock_irqrestore(&rq->lock, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|||||||
* This needs to be done in a top-down fashion because the load of a child
|
* This needs to be done in a top-down fashion because the load of a child
|
||||||
* group is a fraction of its parent's load.
|
* group is a fraction of its parent's load.
|
||||||
*/
|
*/
|
||||||
static void
|
static int tg_load_down(struct task_group *tg, void *data)
|
||||||
tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|
||||||
{
|
{
|
||||||
unsigned long load;
|
unsigned long load;
|
||||||
|
long cpu = (long)data;
|
||||||
|
|
||||||
if (!tg->parent) {
|
if (!tg->parent) {
|
||||||
load = cpu_rq(cpu)->load.weight;
|
load = cpu_rq(cpu)->load.weight;
|
||||||
@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|||||||
}
|
}
|
||||||
|
|
||||||
tg->cfs_rq[cpu]->h_load = load;
|
tg->cfs_rq[cpu]->h_load = load;
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
return 0;
|
||||||
tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
|
|
||||||
{
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void update_shares(struct sched_domain *sd)
|
static void update_shares(struct sched_domain *sd)
|
||||||
@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
|
|||||||
|
|
||||||
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
|
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
|
||||||
sd->last_update = now;
|
sd->last_update = now;
|
||||||
walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
|
walk_tg_tree(tg_nop, tg_shares_up, sd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
|
|||||||
spin_lock(&rq->lock);
|
spin_lock(&rq->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void update_h_load(int cpu)
|
static void update_h_load(long cpu)
|
||||||
{
|
{
|
||||||
walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
|
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
|
|||||||
running = task_running(rq, p);
|
running = task_running(rq, p);
|
||||||
on_rq = p->se.on_rq;
|
on_rq = p->se.on_rq;
|
||||||
ncsw = 0;
|
ncsw = 0;
|
||||||
if (!match_state || p->state == match_state) {
|
if (!match_state || p->state == match_state)
|
||||||
ncsw = p->nivcsw + p->nvcsw;
|
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
|
||||||
if (unlikely(!ncsw))
|
|
||||||
ncsw = 1;
|
|
||||||
}
|
|
||||||
task_rq_unlock(rq, &flags);
|
task_rq_unlock(rq, &flags);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2285,7 +2300,7 @@ out_running:
|
|||||||
trace_mark(kernel_sched_wakeup,
|
trace_mark(kernel_sched_wakeup,
|
||||||
"pid %d state %ld ## rq %p task %p rq->curr %p",
|
"pid %d state %ld ## rq %p task %p rq->curr %p",
|
||||||
p->pid, p->state, rq, p, rq->curr);
|
p->pid, p->state, rq, p, rq->curr);
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, sync);
|
||||||
|
|
||||||
p->state = TASK_RUNNING;
|
p->state = TASK_RUNNING;
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
|
|||||||
trace_mark(kernel_sched_wakeup_new,
|
trace_mark(kernel_sched_wakeup_new,
|
||||||
"pid %d state %ld ## rq %p task %p rq->curr %p",
|
"pid %d state %ld ## rq %p task %p rq->curr %p",
|
||||||
p->pid, p->state, rq, p, rq->curr);
|
p->pid, p->state, rq, p, rq->curr);
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, 0);
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
if (p->sched_class->task_wake_up)
|
if (p->sched_class->task_wake_up)
|
||||||
p->sched_class->task_wake_up(rq, p);
|
p->sched_class->task_wake_up(rq, p);
|
||||||
@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
|
|||||||
* Note that idle threads have a prio of MAX_PRIO, for this test
|
* Note that idle threads have a prio of MAX_PRIO, for this test
|
||||||
* to be always true for them.
|
* to be always true for them.
|
||||||
*/
|
*/
|
||||||
check_preempt_curr(this_rq, p);
|
check_preempt_curr(this_rq, p, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
|
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* complete: - signals a single thread waiting on this completion
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
*
|
||||||
|
* This will wake up a single thread waiting on this completion. Threads will be
|
||||||
|
* awakened in the same order in which they were queued.
|
||||||
|
*
|
||||||
|
* See also complete_all(), wait_for_completion() and related routines.
|
||||||
|
*/
|
||||||
void complete(struct completion *x)
|
void complete(struct completion *x)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
@ -4638,6 +4662,12 @@ void complete(struct completion *x)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(complete);
|
EXPORT_SYMBOL(complete);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* complete_all: - signals all threads waiting on this completion
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
*
|
||||||
|
* This will wake up all threads waiting on this particular completion event.
|
||||||
|
*/
|
||||||
void complete_all(struct completion *x)
|
void complete_all(struct completion *x)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
|
|||||||
wait.flags |= WQ_FLAG_EXCLUSIVE;
|
wait.flags |= WQ_FLAG_EXCLUSIVE;
|
||||||
__add_wait_queue_tail(&x->wait, &wait);
|
__add_wait_queue_tail(&x->wait, &wait);
|
||||||
do {
|
do {
|
||||||
if ((state == TASK_INTERRUPTIBLE &&
|
if (signal_pending_state(state, current)) {
|
||||||
signal_pending(current)) ||
|
|
||||||
(state == TASK_KILLABLE &&
|
|
||||||
fatal_signal_pending(current))) {
|
|
||||||
timeout = -ERESTARTSYS;
|
timeout = -ERESTARTSYS;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
|
|||||||
return timeout;
|
return timeout;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wait_for_completion: - waits for completion of a task
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
*
|
||||||
|
* This waits to be signaled for completion of a specific task. It is NOT
|
||||||
|
* interruptible and there is no timeout.
|
||||||
|
*
|
||||||
|
* See also similar routines (e.g. wait_for_completion_timeout()) with timeout
|
||||||
|
* and interrupt capability. Also see complete().
|
||||||
|
*/
|
||||||
void __sched wait_for_completion(struct completion *x)
|
void __sched wait_for_completion(struct completion *x)
|
||||||
{
|
{
|
||||||
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
|
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(wait_for_completion);
|
EXPORT_SYMBOL(wait_for_completion);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
* @timeout: timeout value in jiffies
|
||||||
|
*
|
||||||
|
* This waits for either a completion of a specific task to be signaled or for a
|
||||||
|
* specified timeout to expire. The timeout is in jiffies. It is not
|
||||||
|
* interruptible.
|
||||||
|
*/
|
||||||
unsigned long __sched
|
unsigned long __sched
|
||||||
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
||||||
{
|
{
|
||||||
@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(wait_for_completion_timeout);
|
EXPORT_SYMBOL(wait_for_completion_timeout);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
*
|
||||||
|
* This waits for completion of a specific task to be signaled. It is
|
||||||
|
* interruptible.
|
||||||
|
*/
|
||||||
int __sched wait_for_completion_interruptible(struct completion *x)
|
int __sched wait_for_completion_interruptible(struct completion *x)
|
||||||
{
|
{
|
||||||
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
|
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
|
||||||
@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(wait_for_completion_interruptible);
|
EXPORT_SYMBOL(wait_for_completion_interruptible);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
* @timeout: timeout value in jiffies
|
||||||
|
*
|
||||||
|
* This waits for either a completion of a specific task to be signaled or for a
|
||||||
|
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
|
||||||
|
*/
|
||||||
unsigned long __sched
|
unsigned long __sched
|
||||||
wait_for_completion_interruptible_timeout(struct completion *x,
|
wait_for_completion_interruptible_timeout(struct completion *x,
|
||||||
unsigned long timeout)
|
unsigned long timeout)
|
||||||
@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wait_for_completion_killable: - waits for completion of a task (killable)
|
||||||
|
* @x: holds the state of this particular completion
|
||||||
|
*
|
||||||
|
* This waits to be signaled for completion of a specific task. It can be
|
||||||
|
* interrupted by a kill signal.
|
||||||
|
*/
|
||||||
int __sched wait_for_completion_killable(struct completion *x)
|
int __sched wait_for_completion_killable(struct completion *x)
|
||||||
{
|
{
|
||||||
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
|
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
|
||||||
@ -5121,7 +5189,8 @@ recheck:
|
|||||||
* Do not allow realtime tasks into groups that have no runtime
|
* Do not allow realtime tasks into groups that have no runtime
|
||||||
* assigned.
|
* assigned.
|
||||||
*/
|
*/
|
||||||
if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
|
if (rt_bandwidth_enabled() && rt_policy(policy) &&
|
||||||
|
task_group(p)->rt_bandwidth.rt_runtime == 0)
|
||||||
return -EPERM;
|
return -EPERM;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
|
|||||||
set_task_cpu(p, dest_cpu);
|
set_task_cpu(p, dest_cpu);
|
||||||
if (on_rq) {
|
if (on_rq) {
|
||||||
activate_task(rq_dest, p, 0);
|
activate_task(rq_dest, p, 0);
|
||||||
check_preempt_curr(rq_dest, p);
|
check_preempt_curr(rq_dest, p, 0);
|
||||||
}
|
}
|
||||||
done:
|
done:
|
||||||
ret = 1;
|
ret = 1;
|
||||||
@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
|
|||||||
static struct ctl_table *
|
static struct ctl_table *
|
||||||
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
||||||
{
|
{
|
||||||
struct ctl_table *table = sd_alloc_ctl_entry(12);
|
struct ctl_table *table = sd_alloc_ctl_entry(13);
|
||||||
|
|
||||||
if (table == NULL)
|
if (table == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
|||||||
sizeof(int), 0644, proc_dointvec_minmax);
|
sizeof(int), 0644, proc_dointvec_minmax);
|
||||||
set_table_entry(&table[10], "flags", &sd->flags,
|
set_table_entry(&table[10], "flags", &sd->flags,
|
||||||
sizeof(int), 0644, proc_dointvec_minmax);
|
sizeof(int), 0644, proc_dointvec_minmax);
|
||||||
/* &table[11] is terminator */
|
set_table_entry(&table[11], "name", sd->name,
|
||||||
|
CORENAME_MAX_SIZE, 0444, proc_dostring);
|
||||||
|
/* &table[12] is terminator */
|
||||||
|
|
||||||
return table;
|
return table;
|
||||||
}
|
}
|
||||||
@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
|
|||||||
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef CONFIG_SCHED_DEBUG
|
||||||
|
# define SD_INIT_NAME(sd, type) sd->name = #type
|
||||||
|
#else
|
||||||
|
# define SD_INIT_NAME(sd, type) do { } while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SD_INIT(sd, type) sd_init_##type(sd)
|
#define SD_INIT(sd, type) sd_init_##type(sd)
|
||||||
|
|
||||||
#define SD_INIT_FUNC(type) \
|
#define SD_INIT_FUNC(type) \
|
||||||
static noinline void sd_init_##type(struct sched_domain *sd) \
|
static noinline void sd_init_##type(struct sched_domain *sd) \
|
||||||
{ \
|
{ \
|
||||||
memset(sd, 0, sizeof(*sd)); \
|
memset(sd, 0, sizeof(*sd)); \
|
||||||
*sd = SD_##type##_INIT; \
|
*sd = SD_##type##_INIT; \
|
||||||
sd->level = SD_LV_##type; \
|
sd->level = SD_LV_##type; \
|
||||||
|
SD_INIT_NAME(sd, type); \
|
||||||
}
|
}
|
||||||
|
|
||||||
SD_INIT_FUNC(CPU)
|
SD_INIT_FUNC(CPU)
|
||||||
@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line)
|
|||||||
#ifdef in_atomic
|
#ifdef in_atomic
|
||||||
static unsigned long prev_jiffy; /* ratelimiting */
|
static unsigned long prev_jiffy; /* ratelimiting */
|
||||||
|
|
||||||
if ((in_atomic() || irqs_disabled()) &&
|
if ((!in_atomic() && !irqs_disabled()) ||
|
||||||
system_state == SYSTEM_RUNNING && !oops_in_progress) {
|
system_state != SYSTEM_RUNNING || oops_in_progress)
|
||||||
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
|
return;
|
||||||
return;
|
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
|
||||||
prev_jiffy = jiffies;
|
return;
|
||||||
printk(KERN_ERR "BUG: sleeping function called from invalid"
|
prev_jiffy = jiffies;
|
||||||
" context at %s:%d\n", file, line);
|
|
||||||
printk("in_atomic():%d, irqs_disabled():%d\n",
|
printk(KERN_ERR
|
||||||
in_atomic(), irqs_disabled());
|
"BUG: sleeping function called from invalid context at %s:%d\n",
|
||||||
debug_show_held_locks(current);
|
file, line);
|
||||||
if (irqs_disabled())
|
printk(KERN_ERR
|
||||||
print_irqtrace_events(current);
|
"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
|
||||||
dump_stack();
|
in_atomic(), irqs_disabled(),
|
||||||
}
|
current->pid, current->comm);
|
||||||
|
|
||||||
|
debug_show_held_locks(current);
|
||||||
|
if (irqs_disabled())
|
||||||
|
print_irqtrace_events(current);
|
||||||
|
dump_stack();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(__might_sleep);
|
EXPORT_SYMBOL(__might_sleep);
|
||||||
@ -8753,75 +8837,97 @@ static DEFINE_MUTEX(rt_constraints_mutex);
|
|||||||
static unsigned long to_ratio(u64 period, u64 runtime)
|
static unsigned long to_ratio(u64 period, u64 runtime)
|
||||||
{
|
{
|
||||||
if (runtime == RUNTIME_INF)
|
if (runtime == RUNTIME_INF)
|
||||||
return 1ULL << 16;
|
return 1ULL << 20;
|
||||||
|
|
||||||
return div64_u64(runtime << 16, period);
|
return div64_u64(runtime << 20, period);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_SCHED
|
|
||||||
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
|
||||||
{
|
|
||||||
struct task_group *tgi, *parent = tg->parent;
|
|
||||||
unsigned long total = 0;
|
|
||||||
|
|
||||||
if (!parent) {
|
|
||||||
if (global_rt_period() < period)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return to_ratio(period, runtime) <
|
|
||||||
to_ratio(global_rt_period(), global_rt_runtime());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
list_for_each_entry_rcu(tgi, &parent->children, siblings) {
|
|
||||||
if (tgi == tg)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
|
|
||||||
tgi->rt_bandwidth.rt_runtime);
|
|
||||||
}
|
|
||||||
rcu_read_unlock();
|
|
||||||
|
|
||||||
return total + to_ratio(period, runtime) <=
|
|
||||||
to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
|
|
||||||
parent->rt_bandwidth.rt_runtime);
|
|
||||||
}
|
|
||||||
#elif defined CONFIG_USER_SCHED
|
|
||||||
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
|
||||||
{
|
|
||||||
struct task_group *tgi;
|
|
||||||
unsigned long total = 0;
|
|
||||||
unsigned long global_ratio =
|
|
||||||
to_ratio(global_rt_period(), global_rt_runtime());
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
list_for_each_entry_rcu(tgi, &task_groups, list) {
|
|
||||||
if (tgi == tg)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
|
|
||||||
tgi->rt_bandwidth.rt_runtime);
|
|
||||||
}
|
|
||||||
rcu_read_unlock();
|
|
||||||
|
|
||||||
return total + to_ratio(period, runtime) < global_ratio;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Must be called with tasklist_lock held */
|
/* Must be called with tasklist_lock held */
|
||||||
static inline int tg_has_rt_tasks(struct task_group *tg)
|
static inline int tg_has_rt_tasks(struct task_group *tg)
|
||||||
{
|
{
|
||||||
struct task_struct *g, *p;
|
struct task_struct *g, *p;
|
||||||
|
|
||||||
do_each_thread(g, p) {
|
do_each_thread(g, p) {
|
||||||
if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
|
if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
|
||||||
return 1;
|
return 1;
|
||||||
} while_each_thread(g, p);
|
} while_each_thread(g, p);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct rt_schedulable_data {
|
||||||
|
struct task_group *tg;
|
||||||
|
u64 rt_period;
|
||||||
|
u64 rt_runtime;
|
||||||
|
};
|
||||||
|
|
||||||
|
static int tg_schedulable(struct task_group *tg, void *data)
|
||||||
|
{
|
||||||
|
struct rt_schedulable_data *d = data;
|
||||||
|
struct task_group *child;
|
||||||
|
unsigned long total, sum = 0;
|
||||||
|
u64 period, runtime;
|
||||||
|
|
||||||
|
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
||||||
|
runtime = tg->rt_bandwidth.rt_runtime;
|
||||||
|
|
||||||
|
if (tg == d->tg) {
|
||||||
|
period = d->rt_period;
|
||||||
|
runtime = d->rt_runtime;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Cannot have more runtime than the period.
|
||||||
|
*/
|
||||||
|
if (runtime > period && runtime != RUNTIME_INF)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ensure we don't starve existing RT tasks.
|
||||||
|
*/
|
||||||
|
if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
|
||||||
|
return -EBUSY;
|
||||||
|
|
||||||
|
total = to_ratio(period, runtime);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Nobody can have more than the global setting allows.
|
||||||
|
*/
|
||||||
|
if (total > to_ratio(global_rt_period(), global_rt_runtime()))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The sum of our children's runtime should not exceed our own.
|
||||||
|
*/
|
||||||
|
list_for_each_entry_rcu(child, &tg->children, siblings) {
|
||||||
|
period = ktime_to_ns(child->rt_bandwidth.rt_period);
|
||||||
|
runtime = child->rt_bandwidth.rt_runtime;
|
||||||
|
|
||||||
|
if (child == d->tg) {
|
||||||
|
period = d->rt_period;
|
||||||
|
runtime = d->rt_runtime;
|
||||||
|
}
|
||||||
|
|
||||||
|
sum += to_ratio(period, runtime);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sum > total)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
||||||
|
{
|
||||||
|
struct rt_schedulable_data data = {
|
||||||
|
.tg = tg,
|
||||||
|
.rt_period = period,
|
||||||
|
.rt_runtime = runtime,
|
||||||
|
};
|
||||||
|
|
||||||
|
return walk_tg_tree(tg_schedulable, tg_nop, &data);
|
||||||
|
}
|
||||||
|
|
||||||
static int tg_set_bandwidth(struct task_group *tg,
|
static int tg_set_bandwidth(struct task_group *tg,
|
||||||
u64 rt_period, u64 rt_runtime)
|
u64 rt_period, u64 rt_runtime)
|
||||||
{
|
{
|
||||||
@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
|
|||||||
|
|
||||||
mutex_lock(&rt_constraints_mutex);
|
mutex_lock(&rt_constraints_mutex);
|
||||||
read_lock(&tasklist_lock);
|
read_lock(&tasklist_lock);
|
||||||
if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
|
err = __rt_schedulable(tg, rt_period, rt_runtime);
|
||||||
err = -EBUSY;
|
if (err)
|
||||||
goto unlock;
|
goto unlock;
|
||||||
}
|
|
||||||
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
|
|
||||||
err = -EINVAL;
|
|
||||||
goto unlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
|
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
|
||||||
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
|
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
|
||||||
@ -8905,19 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
|
|||||||
|
|
||||||
static int sched_rt_global_constraints(void)
|
static int sched_rt_global_constraints(void)
|
||||||
{
|
{
|
||||||
struct task_group *tg = &root_task_group;
|
u64 runtime, period;
|
||||||
u64 rt_runtime, rt_period;
|
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
if (sysctl_sched_rt_period <= 0)
|
if (sysctl_sched_rt_period <= 0)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
runtime = global_rt_runtime();
|
||||||
rt_runtime = tg->rt_bandwidth.rt_runtime;
|
period = global_rt_period();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sanity check on the sysctl variables.
|
||||||
|
*/
|
||||||
|
if (runtime > period && runtime != RUNTIME_INF)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
mutex_lock(&rt_constraints_mutex);
|
mutex_lock(&rt_constraints_mutex);
|
||||||
if (!__rt_schedulable(tg, rt_period, rt_runtime))
|
read_lock(&tasklist_lock);
|
||||||
ret = -EINVAL;
|
ret = __rt_schedulable(NULL, 0, 0);
|
||||||
|
read_unlock(&tasklist_lock);
|
||||||
mutex_unlock(&rt_constraints_mutex);
|
mutex_unlock(&rt_constraints_mutex);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@ -8991,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
|||||||
|
|
||||||
if (!cgrp->parent) {
|
if (!cgrp->parent) {
|
||||||
/* This is early initialization for the top cgroup */
|
/* This is early initialization for the top cgroup */
|
||||||
init_task_group.css.cgroup = cgrp;
|
|
||||||
return &init_task_group.css;
|
return &init_task_group.css;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9000,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
|||||||
if (IS_ERR(tg))
|
if (IS_ERR(tg))
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
|
|
||||||
/* Bind the cgroup to task_group object we just created */
|
|
||||||
tg->css.cgroup = cgrp;
|
|
||||||
|
|
||||||
return &tg->css;
|
return &tg->css;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||||||
return __sched_period(nr_running);
|
return __sched_period(nr_running);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
|
|
||||||
* that it favours >=0 over <0.
|
|
||||||
*
|
|
||||||
* -20 |
|
|
||||||
* |
|
|
||||||
* 0 --------+-------
|
|
||||||
* .'
|
|
||||||
* 19 .'
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
static unsigned long
|
|
||||||
calc_delta_asym(unsigned long delta, struct sched_entity *se)
|
|
||||||
{
|
|
||||||
struct load_weight lw = {
|
|
||||||
.weight = NICE_0_LOAD,
|
|
||||||
.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
|
|
||||||
};
|
|
||||||
|
|
||||||
for_each_sched_entity(se) {
|
|
||||||
struct load_weight *se_lw = &se->load;
|
|
||||||
unsigned long rw = cfs_rq_of(se)->load.weight;
|
|
||||||
|
|
||||||
#ifdef CONFIG_FAIR_SCHED_GROUP
|
|
||||||
struct cfs_rq *cfs_rq = se->my_q;
|
|
||||||
struct task_group *tg = NULL
|
|
||||||
|
|
||||||
if (cfs_rq)
|
|
||||||
tg = cfs_rq->tg;
|
|
||||||
|
|
||||||
if (tg && tg->shares < NICE_0_LOAD) {
|
|
||||||
/*
|
|
||||||
* scale shares to what it would have been had
|
|
||||||
* tg->weight been NICE_0_LOAD:
|
|
||||||
*
|
|
||||||
* weight = 1024 * shares / tg->weight
|
|
||||||
*/
|
|
||||||
lw.weight *= se->load.weight;
|
|
||||||
lw.weight /= tg->shares;
|
|
||||||
|
|
||||||
lw.inv_weight = 0;
|
|
||||||
|
|
||||||
se_lw = &lw;
|
|
||||||
rw += lw.weight - se->load.weight;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (se->load.weight < NICE_0_LOAD) {
|
|
||||||
se_lw = &lw;
|
|
||||||
rw += NICE_0_LOAD - se->load.weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
delta = calc_delta_mine(delta, rw, se_lw);
|
|
||||||
}
|
|
||||||
|
|
||||||
return delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update the current task's runtime statistics. Skip current tasks that
|
* Update the current task's runtime statistics. Skip current tasks that
|
||||||
* are not in our scheduling class.
|
* are not in our scheduling class.
|
||||||
@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||||||
update_load_add(&cfs_rq->load, se->load.weight);
|
update_load_add(&cfs_rq->load, se->load.weight);
|
||||||
if (!parent_entity(se))
|
if (!parent_entity(se))
|
||||||
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
|
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
|
||||||
if (entity_is_task(se))
|
if (entity_is_task(se)) {
|
||||||
add_cfs_task_weight(cfs_rq, se->load.weight);
|
add_cfs_task_weight(cfs_rq, se->load.weight);
|
||||||
|
list_add(&se->group_node, &cfs_rq->tasks);
|
||||||
|
}
|
||||||
cfs_rq->nr_running++;
|
cfs_rq->nr_running++;
|
||||||
se->on_rq = 1;
|
se->on_rq = 1;
|
||||||
list_add(&se->group_node, &cfs_rq->tasks);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||||||
update_load_sub(&cfs_rq->load, se->load.weight);
|
update_load_sub(&cfs_rq->load, se->load.weight);
|
||||||
if (!parent_entity(se))
|
if (!parent_entity(se))
|
||||||
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
|
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
|
||||||
if (entity_is_task(se))
|
if (entity_is_task(se)) {
|
||||||
add_cfs_task_weight(cfs_rq, -se->load.weight);
|
add_cfs_task_weight(cfs_rq, -se->load.weight);
|
||||||
|
list_del_init(&se->group_node);
|
||||||
|
}
|
||||||
cfs_rq->nr_running--;
|
cfs_rq->nr_running--;
|
||||||
se->on_rq = 0;
|
se->on_rq = 0;
|
||||||
list_del_init(&se->group_node);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||||
@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
|
|||||||
long wl, long wg)
|
long wl, long wg)
|
||||||
{
|
{
|
||||||
struct sched_entity *se = tg->se[cpu];
|
struct sched_entity *se = tg->se[cpu];
|
||||||
long more_w;
|
|
||||||
|
|
||||||
if (!tg->parent)
|
if (!tg->parent)
|
||||||
return wl;
|
return wl;
|
||||||
@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
|
|||||||
if (!wl && sched_feat(ASYM_EFF_LOAD))
|
if (!wl && sched_feat(ASYM_EFF_LOAD))
|
||||||
return wl;
|
return wl;
|
||||||
|
|
||||||
/*
|
|
||||||
* Instead of using this increment, also add the difference
|
|
||||||
* between when the shares were last updated and now.
|
|
||||||
*/
|
|
||||||
more_w = se->my_q->load.weight - se->my_q->rq_weight;
|
|
||||||
wl += more_w;
|
|
||||||
wg += more_w;
|
|
||||||
|
|
||||||
for_each_sched_entity(se) {
|
for_each_sched_entity(se) {
|
||||||
#define D(n) (likely(n) ? (n) : 1)
|
|
||||||
|
|
||||||
long S, rw, s, a, b;
|
long S, rw, s, a, b;
|
||||||
|
long more_w;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Instead of using this increment, also add the difference
|
||||||
|
* between when the shares were last updated and now.
|
||||||
|
*/
|
||||||
|
more_w = se->my_q->load.weight - se->my_q->rq_weight;
|
||||||
|
wl += more_w;
|
||||||
|
wg += more_w;
|
||||||
|
|
||||||
S = se->my_q->tg->shares;
|
S = se->my_q->tg->shares;
|
||||||
s = se->my_q->shares;
|
s = se->my_q->shares;
|
||||||
@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
|
|||||||
a = S*(rw + wl);
|
a = S*(rw + wl);
|
||||||
b = S*rw + s*wg;
|
b = S*rw + s*wg;
|
||||||
|
|
||||||
wl = s*(a-b)/D(b);
|
wl = s*(a-b);
|
||||||
|
|
||||||
|
if (likely(b))
|
||||||
|
wl /= b;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Assume the group is already running and will
|
* Assume the group is already running and will
|
||||||
* thus already be accounted for in the weight.
|
* thus already be accounted for in the weight.
|
||||||
@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
|
|||||||
* alter the group weight.
|
* alter the group weight.
|
||||||
*/
|
*/
|
||||||
wg = 0;
|
wg = 0;
|
||||||
#undef D
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return wl;
|
return wl;
|
||||||
@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
|
wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
|
||||||
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
|
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
|
||||||
int idx, unsigned long load, unsigned long this_load,
|
int idx, unsigned long load, unsigned long this_load,
|
||||||
unsigned int imbalance)
|
unsigned int imbalance)
|
||||||
@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
|
|||||||
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
|
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
if (!sync && sched_feat(SYNC_WAKEUPS) &&
|
||||||
|
curr->se.avg_overlap < sysctl_sched_migration_cost &&
|
||||||
|
p->se.avg_overlap < sysctl_sched_migration_cost)
|
||||||
|
sync = 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If sync wakeup then subtract the (maximum possible)
|
* If sync wakeup then subtract the (maximum possible)
|
||||||
* effect of the currently running task from the load
|
* effect of the currently running task from the load
|
||||||
@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
|
|||||||
* a reasonable amount of time then attract this newly
|
* a reasonable amount of time then attract this newly
|
||||||
* woken task:
|
* woken task:
|
||||||
*/
|
*/
|
||||||
if (sync && balanced) {
|
if (sync && balanced)
|
||||||
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
|
return 1;
|
||||||
p->se.avg_overlap < sysctl_sched_migration_cost)
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
schedstat_inc(p, se.nr_wakeups_affine_attempts);
|
schedstat_inc(p, se.nr_wakeups_affine_attempts);
|
||||||
tl_per_task = cpu_avg_load_per_task(this_cpu);
|
tl_per_task = cpu_avg_load_per_task(this_cpu);
|
||||||
|
|
||||||
if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
|
if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
|
||||||
balanced) {
|
tl_per_task)) {
|
||||||
/*
|
/*
|
||||||
* This domain has SD_WAKE_AFFINE and
|
* This domain has SD_WAKE_AFFINE and
|
||||||
* p is cache cold in this domain, and
|
* p is cache cold in this domain, and
|
||||||
@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
|
|||||||
struct sched_domain *sd, *this_sd = NULL;
|
struct sched_domain *sd, *this_sd = NULL;
|
||||||
int prev_cpu, this_cpu, new_cpu;
|
int prev_cpu, this_cpu, new_cpu;
|
||||||
unsigned long load, this_load;
|
unsigned long load, this_load;
|
||||||
struct rq *rq, *this_rq;
|
struct rq *this_rq;
|
||||||
unsigned int imbalance;
|
unsigned int imbalance;
|
||||||
int idx;
|
int idx;
|
||||||
|
|
||||||
prev_cpu = task_cpu(p);
|
prev_cpu = task_cpu(p);
|
||||||
rq = task_rq(p);
|
|
||||||
this_cpu = smp_processor_id();
|
this_cpu = smp_processor_id();
|
||||||
this_rq = cpu_rq(this_cpu);
|
this_rq = cpu_rq(this_cpu);
|
||||||
new_cpu = prev_cpu;
|
new_cpu = prev_cpu;
|
||||||
|
|
||||||
|
if (prev_cpu == this_cpu)
|
||||||
|
goto out;
|
||||||
/*
|
/*
|
||||||
* 'this_sd' is the first domain that both
|
* 'this_sd' is the first domain that both
|
||||||
* this_cpu and prev_cpu are present in:
|
* this_cpu and prev_cpu are present in:
|
||||||
@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
|
|||||||
load = source_load(prev_cpu, idx);
|
load = source_load(prev_cpu, idx);
|
||||||
this_load = target_load(this_cpu, idx);
|
this_load = target_load(this_cpu, idx);
|
||||||
|
|
||||||
if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
|
if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
|
||||||
load, this_load, imbalance))
|
load, this_load, imbalance))
|
||||||
return this_cpu;
|
return this_cpu;
|
||||||
|
|
||||||
if (prev_cpu == this_cpu)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Start passive balancing when half the imbalance_pct
|
* Start passive balancing when half the imbalance_pct
|
||||||
* limit is reached.
|
* limit is reached.
|
||||||
@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
|
|||||||
* + nice tasks.
|
* + nice tasks.
|
||||||
*/
|
*/
|
||||||
if (sched_feat(ASYM_GRAN))
|
if (sched_feat(ASYM_GRAN))
|
||||||
gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
|
gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
|
||||||
else
|
|
||||||
gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
|
|
||||||
|
|
||||||
return gran;
|
return gran;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Should 'se' preempt 'curr'.
|
|
||||||
*
|
|
||||||
* |s1
|
|
||||||
* |s2
|
|
||||||
* |s3
|
|
||||||
* g
|
|
||||||
* |<--->|c
|
|
||||||
*
|
|
||||||
* w(c, s1) = -1
|
|
||||||
* w(c, s2) = 0
|
|
||||||
* w(c, s3) = 1
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
static int
|
|
||||||
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
|
|
||||||
{
|
|
||||||
s64 gran, vdiff = curr->vruntime - se->vruntime;
|
|
||||||
|
|
||||||
if (vdiff < 0)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
gran = wakeup_gran(curr);
|
|
||||||
if (vdiff > gran)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* return depth at which a sched entity is present in the hierarchy */
|
|
||||||
static inline int depth_se(struct sched_entity *se)
|
|
||||||
{
|
|
||||||
int depth = 0;
|
|
||||||
|
|
||||||
for_each_sched_entity(se)
|
|
||||||
depth++;
|
|
||||||
|
|
||||||
return depth;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Preempt the current task with a newly woken task if needed:
|
* Preempt the current task with a newly woken task if needed:
|
||||||
*/
|
*/
|
||||||
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
|
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
|
||||||
{
|
{
|
||||||
struct task_struct *curr = rq->curr;
|
struct task_struct *curr = rq->curr;
|
||||||
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
||||||
struct sched_entity *se = &curr->se, *pse = &p->se;
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
||||||
int se_depth, pse_depth;
|
s64 delta_exec;
|
||||||
|
|
||||||
if (unlikely(rt_prio(p->prio))) {
|
if (unlikely(rt_prio(p->prio))) {
|
||||||
update_rq_clock(rq);
|
update_rq_clock(rq);
|
||||||
@ -1350,6 +1253,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
|
|||||||
|
|
||||||
cfs_rq_of(pse)->next = pse;
|
cfs_rq_of(pse)->next = pse;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We can come here with TIF_NEED_RESCHED already set from new task
|
||||||
|
* wake up path.
|
||||||
|
*/
|
||||||
|
if (test_tsk_need_resched(curr))
|
||||||
|
return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Batch tasks do not preempt (their preemption is driven by
|
* Batch tasks do not preempt (their preemption is driven by
|
||||||
* the tick):
|
* the tick):
|
||||||
@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
|
|||||||
if (!sched_feat(WAKEUP_PREEMPT))
|
if (!sched_feat(WAKEUP_PREEMPT))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/*
|
if (sched_feat(WAKEUP_OVERLAP) && (sync ||
|
||||||
* preemption test can be made between sibling entities who are in the
|
(se->avg_overlap < sysctl_sched_migration_cost &&
|
||||||
* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
|
pse->avg_overlap < sysctl_sched_migration_cost))) {
|
||||||
* both tasks until we find their ancestors who are siblings of common
|
resched_task(curr);
|
||||||
* parent.
|
return;
|
||||||
*/
|
|
||||||
|
|
||||||
/* First walk up until both entities are at same depth */
|
|
||||||
se_depth = depth_se(se);
|
|
||||||
pse_depth = depth_se(pse);
|
|
||||||
|
|
||||||
while (se_depth > pse_depth) {
|
|
||||||
se_depth--;
|
|
||||||
se = parent_entity(se);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while (pse_depth > se_depth) {
|
delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
||||||
pse_depth--;
|
if (delta_exec > wakeup_gran(pse))
|
||||||
pse = parent_entity(pse);
|
|
||||||
}
|
|
||||||
|
|
||||||
while (!is_same_group(se, pse)) {
|
|
||||||
se = parent_entity(se);
|
|
||||||
pse = parent_entity(pse);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wakeup_preempt_entity(se, pse) == 1)
|
|
||||||
resched_task(curr);
|
resched_task(curr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
|
|||||||
if (next == &cfs_rq->tasks)
|
if (next == &cfs_rq->tasks)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* Skip over entities that are not tasks */
|
se = list_entry(next, struct sched_entity, group_node);
|
||||||
do {
|
p = task_of(se);
|
||||||
se = list_entry(next, struct sched_entity, group_node);
|
cfs_rq->balance_iterator = next->next;
|
||||||
next = next->next;
|
|
||||||
} while (next != &cfs_rq->tasks && !entity_is_task(se));
|
|
||||||
|
|
||||||
if (next == &cfs_rq->tasks)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
cfs_rq->balance_iterator = next;
|
|
||||||
|
|
||||||
if (entity_is_task(se))
|
|
||||||
p = task_of(se);
|
|
||||||
|
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
update_h_load(busiest_cpu);
|
update_h_load(busiest_cpu);
|
||||||
|
|
||||||
list_for_each_entry(tg, &task_groups, list) {
|
list_for_each_entry_rcu(tg, &task_groups, list) {
|
||||||
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
|
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
|
||||||
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
|
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
|
||||||
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
|
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
|
||||||
@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
|
|||||||
* 'current' within the tree based on its new key value.
|
* 'current' within the tree based on its new key value.
|
||||||
*/
|
*/
|
||||||
swap(curr->vruntime, se->vruntime);
|
swap(curr->vruntime, se->vruntime);
|
||||||
|
resched_task(rq->curr);
|
||||||
}
|
}
|
||||||
|
|
||||||
enqueue_task_fair(rq, p, 0);
|
enqueue_task_fair(rq, p, 0);
|
||||||
resched_task(rq->curr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
|
|||||||
if (p->prio > oldprio)
|
if (p->prio > oldprio)
|
||||||
resched_task(rq->curr);
|
resched_task(rq->curr);
|
||||||
} else
|
} else
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
|
|||||||
if (running)
|
if (running)
|
||||||
resched_task(rq->curr);
|
resched_task(rq->curr);
|
||||||
else
|
else
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Account for a task changing its policy or group.
|
/* Account for a task changing its policy or group.
|
||||||
|
@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
|
|||||||
SCHED_FEAT(LB_BIAS, 1)
|
SCHED_FEAT(LB_BIAS, 1)
|
||||||
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
|
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
|
||||||
SCHED_FEAT(ASYM_EFF_LOAD, 1)
|
SCHED_FEAT(ASYM_EFF_LOAD, 1)
|
||||||
|
SCHED_FEAT(WAKEUP_OVERLAP, 0)
|
||||||
|
@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
|
|||||||
/*
|
/*
|
||||||
* Idle tasks are unconditionally rescheduled:
|
* Idle tasks are unconditionally rescheduled:
|
||||||
*/
|
*/
|
||||||
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
|
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
|
||||||
{
|
{
|
||||||
resched_task(rq->idle);
|
resched_task(rq->idle);
|
||||||
}
|
}
|
||||||
@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
|
|||||||
if (running)
|
if (running)
|
||||||
resched_task(rq->curr);
|
resched_task(rq->curr);
|
||||||
else
|
else
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
|
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
|
||||||
@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
|
|||||||
if (p->prio > oldprio)
|
if (p->prio > oldprio)
|
||||||
resched_task(rq->curr);
|
resched_task(rq->curr);
|
||||||
} else
|
} else
|
||||||
check_preempt_curr(rq, p);
|
check_preempt_curr(rq, p, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
|
|||||||
|
|
||||||
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
||||||
{
|
{
|
||||||
|
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
|
||||||
struct sched_rt_entity *rt_se = rt_rq->rt_se;
|
struct sched_rt_entity *rt_se = rt_rq->rt_se;
|
||||||
|
|
||||||
if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
|
if (rt_rq->rt_nr_running) {
|
||||||
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
|
if (rt_se && !on_rt_rq(rt_se))
|
||||||
|
enqueue_rt_entity(rt_se);
|
||||||
enqueue_rt_entity(rt_se);
|
|
||||||
if (rt_rq->highest_prio < curr->prio)
|
if (rt_rq->highest_prio < curr->prio)
|
||||||
resched_task(curr);
|
resched_task(curr);
|
||||||
}
|
}
|
||||||
@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
|
|||||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
|
/*
|
||||||
|
* We ran out of runtime, see if we can borrow some from our neighbours.
|
||||||
|
*/
|
||||||
static int do_balance_runtime(struct rt_rq *rt_rq)
|
static int do_balance_runtime(struct rt_rq *rt_rq)
|
||||||
{
|
{
|
||||||
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
|
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
|
||||||
@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
spin_lock(&iter->rt_runtime_lock);
|
spin_lock(&iter->rt_runtime_lock);
|
||||||
|
/*
|
||||||
|
* Either all rqs have inf runtime and there's nothing to steal
|
||||||
|
* or __disable_runtime() below sets a specific rq to inf to
|
||||||
|
* indicate it's been disabled and disallow stealing.
|
||||||
|
*/
|
||||||
if (iter->rt_runtime == RUNTIME_INF)
|
if (iter->rt_runtime == RUNTIME_INF)
|
||||||
goto next;
|
goto next;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* From runqueues with spare time, take 1/n part of their
|
||||||
|
* spare time, but no more than our period.
|
||||||
|
*/
|
||||||
diff = iter->rt_runtime - iter->rt_time;
|
diff = iter->rt_runtime - iter->rt_time;
|
||||||
if (diff > 0) {
|
if (diff > 0) {
|
||||||
diff = div_u64((u64)diff, weight);
|
diff = div_u64((u64)diff, weight);
|
||||||
@ -274,6 +286,9 @@ next:
|
|||||||
return more;
|
return more;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ensure this RQ takes back all the runtime it lent to its neighbours.
|
||||||
|
*/
|
||||||
static void __disable_runtime(struct rq *rq)
|
static void __disable_runtime(struct rq *rq)
|
||||||
{
|
{
|
||||||
struct root_domain *rd = rq->rd;
|
struct root_domain *rd = rq->rd;
|
||||||
@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
|
|||||||
|
|
||||||
spin_lock(&rt_b->rt_runtime_lock);
|
spin_lock(&rt_b->rt_runtime_lock);
|
||||||
spin_lock(&rt_rq->rt_runtime_lock);
|
spin_lock(&rt_rq->rt_runtime_lock);
|
||||||
|
/*
|
||||||
|
* Either we're all inf and nobody needs to borrow, or we're
|
||||||
|
* already disabled and thus have nothing to do, or we have
|
||||||
|
* exactly the right amount of runtime to take out.
|
||||||
|
*/
|
||||||
if (rt_rq->rt_runtime == RUNTIME_INF ||
|
if (rt_rq->rt_runtime == RUNTIME_INF ||
|
||||||
rt_rq->rt_runtime == rt_b->rt_runtime)
|
rt_rq->rt_runtime == rt_b->rt_runtime)
|
||||||
goto balanced;
|
goto balanced;
|
||||||
spin_unlock(&rt_rq->rt_runtime_lock);
|
spin_unlock(&rt_rq->rt_runtime_lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Calculate the difference between what we started out with
|
||||||
|
* and what we currently have, that's the amount of runtime
|
||||||
|
* we lent and now have to reclaim.
|
||||||
|
*/
|
||||||
want = rt_b->rt_runtime - rt_rq->rt_runtime;
|
want = rt_b->rt_runtime - rt_rq->rt_runtime;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Greedy reclaim, take back as much as we can.
|
||||||
|
*/
|
||||||
for_each_cpu_mask(i, rd->span) {
|
for_each_cpu_mask(i, rd->span) {
|
||||||
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
|
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
|
||||||
s64 diff;
|
s64 diff;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Can't reclaim from ourselves or disabled runqueues.
|
||||||
|
*/
|
||||||
if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
|
if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
|
|||||||
}
|
}
|
||||||
|
|
||||||
spin_lock(&rt_rq->rt_runtime_lock);
|
spin_lock(&rt_rq->rt_runtime_lock);
|
||||||
|
/*
|
||||||
|
* We cannot be left wanting - that would mean some runtime
|
||||||
|
* leaked out of the system.
|
||||||
|
*/
|
||||||
BUG_ON(want);
|
BUG_ON(want);
|
||||||
balanced:
|
balanced:
|
||||||
|
/*
|
||||||
|
* Disable all the borrow logic by pretending we have inf
|
||||||
|
* runtime - in which case borrowing doesn't make sense.
|
||||||
|
*/
|
||||||
rt_rq->rt_runtime = RUNTIME_INF;
|
rt_rq->rt_runtime = RUNTIME_INF;
|
||||||
spin_unlock(&rt_rq->rt_runtime_lock);
|
spin_unlock(&rt_rq->rt_runtime_lock);
|
||||||
spin_unlock(&rt_b->rt_runtime_lock);
|
spin_unlock(&rt_b->rt_runtime_lock);
|
||||||
@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
|
|||||||
if (unlikely(!scheduler_running))
|
if (unlikely(!scheduler_running))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset each runqueue's bandwidth settings
|
||||||
|
*/
|
||||||
for_each_leaf_rt_rq(rt_rq, rq) {
|
for_each_leaf_rt_rq(rt_rq, rq) {
|
||||||
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
|
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
|
||||||
|
|
||||||
@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
|
|||||||
int i, idle = 1;
|
int i, idle = 1;
|
||||||
cpumask_t span;
|
cpumask_t span;
|
||||||
|
|
||||||
if (rt_b->rt_runtime == RUNTIME_INF)
|
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
span = sched_rt_period_mask();
|
span = sched_rt_period_mask();
|
||||||
@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
|
|||||||
curr->se.exec_start = rq->clock;
|
curr->se.exec_start = rq->clock;
|
||||||
cpuacct_charge(curr, delta_exec);
|
cpuacct_charge(curr, delta_exec);
|
||||||
|
|
||||||
|
if (!rt_bandwidth_enabled())
|
||||||
|
return;
|
||||||
|
|
||||||
for_each_sched_rt_entity(rt_se) {
|
for_each_sched_rt_entity(rt_se) {
|
||||||
rt_rq = rt_rq_of_se(rt_se);
|
rt_rq = rt_rq_of_se(rt_se);
|
||||||
|
|
||||||
@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
|
|||||||
/*
|
/*
|
||||||
* Preempt the current task with a newly woken task if needed:
|
* Preempt the current task with a newly woken task if needed:
|
||||||
*/
|
*/
|
||||||
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
|
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
|
||||||
{
|
{
|
||||||
if (p->prio < rq->curr->prio) {
|
if (p->prio < rq->curr->prio) {
|
||||||
resched_task(rq->curr);
|
resched_task(rq->curr);
|
||||||
|
@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
|
|||||||
{
|
{
|
||||||
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
|
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
|
||||||
|
|
||||||
return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
|
return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
|
||||||
}
|
}
|
||||||
|
|
||||||
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
|
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
|
||||||
@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
|
|||||||
unsigned long rt_runtime;
|
unsigned long rt_runtime;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
sscanf(buf, "%lu", &rt_runtime);
|
sscanf(buf, "%ld", &rt_runtime);
|
||||||
|
|
||||||
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
|
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user