2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* kernel/cpuset.c
|
|
|
|
*
|
|
|
|
* Processor and Memory placement constraints for sets of tasks.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2003 BULL SA.
|
2007-10-19 14:40:20 +08:00
|
|
|
* Copyright (C) 2004-2007 Silicon Graphics, Inc.
|
2007-10-19 14:39:39 +08:00
|
|
|
* Copyright (C) 2006 Google, Inc
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Portions derived from Patrick Mochel's sysfs code.
|
|
|
|
* sysfs is Copyright (c) 2001-3 Patrick Mochel
|
|
|
|
*
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
* 2003-10-10 Written by Simon Derr.
|
2005-04-17 06:20:36 +08:00
|
|
|
* 2003-10-22 Updates by Stephen Hemminger.
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
* 2004 May-July Rework by Paul Jackson.
|
2007-10-19 14:39:39 +08:00
|
|
|
* 2006 Rework by Paul Menage to use generic cgroups
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
* 2008 Rework of the scheduler domains and CPU hotplug handling
|
|
|
|
* by Max Krasnyansky
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* This file is subject to the terms and conditions of the GNU General Public
|
|
|
|
* License. See the file COPYING in the main directory of the Linux
|
|
|
|
* distribution for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/cpuset.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/kernel.h>
|
[PATCH] cpusets: automatic numa mempolicy rebinding
This patch automatically updates a tasks NUMA mempolicy when its cpuset
memory placement changes. It does so within the context of the task,
without any need to support low level external mempolicy manipulation.
If a system is not using cpusets, or if running on a system with just the
root (all-encompassing) cpuset, then this remap is a no-op. Only when a
task is moved between cpusets, or a cpusets memory placement is changed
does the following apply. Otherwise, the main routine below,
rebind_policy() is not even called.
When mixing cpusets, scheduler affinity, and NUMA mempolicies, the
essential role of cpusets is to place jobs (several related tasks) on a set
of CPUs and Memory Nodes, the essential role of sched_setaffinity is to
manage a jobs processor placement within its allowed cpuset, and the
essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs
memory placement within its allowed cpuset.
However, CPU affinity and NUMA memory placement are managed within the
kernel using absolute system wide numbering, not cpuset relative numbering.
This is ok until a job is migrated to a different cpuset, or what's the
same, a jobs cpuset is moved to different CPUs and Memory Nodes.
Then the CPU affinity and NUMA memory placement of the tasks in the job
need to be updated, to preserve their cpuset-relative position. This can
be done for CPU affinity using sched_setaffinity() from user code, as one
task can modify another's CPU affinity. This cannot be done from an
external task for NUMA memory placement, as that can only be modified in
the context of the task using it.
However, it is easy enough to remap a task's NUMA mempolicy automatically when
a task is migrated, using the existing cpuset mechanism to trigger a
refresh of a tasks memory placement after its cpuset has changed. All that
is needed is the old and new nodemask, and notice to the task that it needs
to rebind its mempolicy. The tasks mems_allowed has the old mask, the
tasks cpuset has the new mask, and the existing
cpuset_update_current_mems_allowed() mechanism provides the notice. The
bitmap/cpumask/nodemask remap operators provide the cpuset relative
calculations.
This patch leaves open a couple of issues:
1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies:
These mempolicies may reference nodes outside of those allowed to
the current task by its cpuset. Tasks are migrated as part of jobs,
which reside on what might be several cpusets in a subtree. When such
a job is migrated, all NUMA memory policy references to nodes within
that cpuset subtree should be translated, and references to any nodes
outside that subtree should be left untouched. A future patch will
provide the cpuset mechanism needed to mark such subtrees. With that
patch, we will be able to correctly migrate these other memory policies
across a job migration.
2) Updating cpuset, affinity and memory policies in user space:
This is harder. Any placement state stored in user space using
system-wide numbering will be invalidated across a migration. More
work will be required to provide user code with a migration-safe means
to manage its cpuset relative placement, while preserving the current
API's that pass system wide numbers, not cpuset relative numbers across
the kernel-user boundary.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:36 +08:00
|
|
|
#include <linux/mempolicy.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/mm.h>
|
2008-11-20 07:36:30 +08:00
|
|
|
#include <linux/memory.h>
|
2011-05-24 02:51:41 +08:00
|
|
|
#include <linux/export.h>
|
[PATCH] cpuset: use rcu directly optimization
Optimize the cpuset impact on page allocation, the most performance critical
cpuset hook in the kernel.
On each page allocation, the cpuset hook needs to check for a possible change
in the current tasks cpuset. It can now handle the common case, of no change,
without taking any spinlock or semaphore, thanks to RCU.
Convert a spinlock on the current task to an rcu_read_lock(), saving
approximately a memory barrier and an atomic op, depending on architecture.
This is done by adding rcu_assign_pointer() and synchronize_rcu() calls to the
write side of the task->cpuset pointer, in cpuset.c:attach_task(), to delay
freeing up a detached cpuset until after any critical sections referencing
that pointer.
Thanks to Andi Kleen, Nick Piggin and Eric Dumazet for ideas.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:02:02 +08:00
|
|
|
#include <linux/rcupdate.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/sched.h>
|
2019-07-19 21:59:55 +08:00
|
|
|
#include <linux/sched/deadline.h>
|
2017-02-09 01:51:29 +08:00
|
|
|
#include <linux/sched/mm.h>
|
2017-02-06 17:57:33 +08:00
|
|
|
#include <linux/sched/task.h>
|
2006-06-23 17:04:00 +08:00
|
|
|
#include <linux/security.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/spinlock.h>
|
2017-09-07 07:24:53 +08:00
|
|
|
#include <linux/oom.h>
|
2017-10-27 10:42:37 +08:00
|
|
|
#include <linux/sched/isolation.h>
|
2008-02-07 16:14:43 +08:00
|
|
|
#include <linux/cgroup.h>
|
2013-06-09 17:14:22 +08:00
|
|
|
#include <linux/wait.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
cpuset: fix a deadlock due to incomplete patching of cpusets_enabled()
In codepaths that use the begin/retry interface for reading
mems_allowed_seq with irqs disabled, there exists a race condition that
stalls the patch process after only modifying a subset of the
static_branch call sites.
This problem manifested itself as a deadlock in the slub allocator,
inside get_any_partial. The loop reads mems_allowed_seq value (via
read_mems_allowed_begin), performs the defrag operation, and then
verifies the consistency of mem_allowed via the read_mems_allowed_retry
and the cookie returned by xxx_begin.
The issue here is that both begin and retry first check if cpusets are
enabled via cpusets_enabled() static branch. This branch can be
rewritten dynamically (via cpuset_inc) if a new cpuset is created. The
x86 jump label code fully synchronizes across all CPUs for every entry
it rewrites. If it rewrites only one of the callsites (specifically the
one in read_mems_allowed_retry) and then waits for the
smp_call_function(do_sync_core) to complete while a CPU is inside the
begin/retry section with IRQs off and the mems_allowed value is changed,
we can hang.
This is because begin() will always return 0 (since it wasn't patched
yet) while retry() will test the 0 against the actual value of the seq
counter.
The fix is to use two different static keys: one for begin
(pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we
first bump the pre_enable key to ensure that cpuset_mems_allowed_begin()
always returns a valid seqcount if we are enabling cpusets. Similarly, when
disabling cpusets via cpuset_dec(), we first ensure that callers of
cpuset_mems_allowed_retry() will start ignoring the seqcount value
before we let cpuset_mems_allowed_begin() return 0.
The relevant stack traces of the two stuck threads:
CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4
Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017
task: ffff8817f9c28000 task.stack: ffffc9000ffa4000
RIP: smp_call_function_many+0x1f9/0x260
Call Trace:
smp_call_function+0x3b/0x70
on_each_cpu+0x2f/0x90
text_poke_bp+0x87/0xd0
arch_jump_label_transform+0x93/0x100
__jump_label_update+0x77/0x90
jump_label_update+0xaa/0xc0
static_key_slow_inc+0x9e/0xb0
cpuset_css_online+0x70/0x2e0
online_css+0x2c/0xa0
cgroup_apply_control_enable+0x27f/0x3d0
cgroup_mkdir+0x2b7/0x420
kernfs_iop_mkdir+0x5a/0x80
vfs_mkdir+0xf6/0x1a0
SyS_mkdir+0xb7/0xe0
entry_SYSCALL_64_fastpath+0x18/0xad
...
CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4
Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017
task: ffff8818087c0000 task.stack: ffffc90000030000
RIP: int3+0x39/0x70
Call Trace:
<#DB> ? ___slab_alloc+0x28b/0x5a0
<EOE> ? copy_process.part.40+0xf7/0x1de0
__slab_alloc.isra.80+0x54/0x90
copy_process.part.40+0xf7/0x1de0
copy_process.part.40+0xf7/0x1de0
kmem_cache_alloc_node+0x8a/0x280
copy_process.part.40+0xf7/0x1de0
_do_fork+0xe7/0x6c0
_raw_spin_unlock_irq+0x2d/0x60
trace_hardirqs_on_caller+0x136/0x1d0
entry_SYSCALL_64_fastpath+0x5/0xad
do_syscall_64+0x27/0x350
SyS_clone+0x19/0x20
do_syscall_64+0x60/0x350
entry_SYSCALL64_slow_path+0x25/0x25
Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com
Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled")
Signed-off-by: Dima Zavin <dmitriyz@waymo.com>
Reported-by: Cliff Spradlin <cspradlin@waymo.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-03 04:32:18 +08:00
|
|
|
/*
 * Static key for the "begin" side of the mems_allowed_seq begin/retry
 * read interface. It is kept separate from cpusets_enabled_key so that
 * the begin side can be switched on before, and switched off after, the
 * retry side; a reader caught between the two jump-label patches then
 * never compares a stale cookie of 0 against the live sequence count.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
|
2016-05-20 08:14:30 +08:00
|
|
|
/*
 * Static key behind the cpusets_enabled() static branch; it also gates the
 * "retry" side of the mems_allowed_seq begin/retry read interface.
 * NOTE(review): the ordering against cpusets_pre_enable_key is presumably
 * enforced in cpuset_inc()/cpuset_dec(), which are defined elsewhere --
 * confirm there before changing either key.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
|
2006-01-08 17:01:57 +08:00
|
|
|
|
mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.
It can be reproduced with command:
$ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"
(where node 4 is a movable node)
runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
Call Trace:
dump_stack+0x6b/0x88
dump_header+0x4a/0x1e2
oom_kill_process.cold+0xb/0x10
out_of_memory.part.0+0xaf/0x230
out_of_memory+0x3d/0x80
__alloc_pages_slowpath.constprop.0+0x954/0xa20
__alloc_pages_nodemask+0x2d3/0x300
pipe_write+0x322/0x590
new_sync_write+0x196/0x1b0
vfs_write+0x1c3/0x1f0
ksys_write+0xa7/0xe0
do_syscall_64+0x52/0xd0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mem-Info:
active_anon:392832 inactive_anon:182 isolated_anon:0
active_file:68130 inactive_file:151527 isolated_file:0
unevictable:2701 dirty:0 writeback:7
slab_reclaimable:51418 slab_unreclaimable:116300
mapped:45825 shmem:735 pagetables:2540 bounce:0
free:159849484 free_pcp:73 free_cma:0
Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0 0
Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB
oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is invoked even when normal nodes and movable
nodes both have plenty of free memory.
The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope. The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.
So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.
As page allocation is one of the hottest path in kernel, this check will
hurt all users with sane cpuset configuration, add a static branch check
and detect the abnormal config in cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.
[thanks to Michal Hocko and David Rientjes for suggesting not handling
it inside OOM code, adding cpuset check, refining comments]
Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 04:40:34 +08:00
|
|
|
/*
|
|
|
|
* There could be abnormal cpuset configurations for cpu or memory
|
2022-03-06 04:46:57 +08:00
|
|
|
* node binding, add this key to provide a quick low-cost judgment
|
mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.
It can be reproduced with command:
$ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"
(where node 4 is a movable node)
runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
Call Trace:
dump_stack+0x6b/0x88
dump_header+0x4a/0x1e2
oom_kill_process.cold+0xb/0x10
out_of_memory.part.0+0xaf/0x230
out_of_memory+0x3d/0x80
__alloc_pages_slowpath.constprop.0+0x954/0xa20
__alloc_pages_nodemask+0x2d3/0x300
pipe_write+0x322/0x590
new_sync_write+0x196/0x1b0
vfs_write+0x1c3/0x1f0
ksys_write+0xa7/0xe0
do_syscall_64+0x52/0xd0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mem-Info:
active_anon:392832 inactive_anon:182 isolated_anon:0
active_file:68130 inactive_file:151527 isolated_file:0
unevictable:2701 dirty:0 writeback:7
slab_reclaimable:51418 slab_unreclaimable:116300
mapped:45825 shmem:735 pagetables:2540 bounce:0
free:159849484 free_pcp:73 free_cma:0
Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0 0
Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB
oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is invoked even when normal nodes and movable
nodes both have plenty of free memory.
The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope. The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.
So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.
As page allocation is one of the hottest path in kernel, this check will
hurt all users with sane cpuset configuration, add a static branch check
and detect the abnormal config in cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.
[thanks to Michal Hocko and David Rientjes for suggesting not handling
it inside OOM code, adding cpuset check, refining comments]
Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 04:40:34 +08:00
|
|
|
* of the situation.
|
|
|
|
*/
|
|
|
|
/*
 * Defaults to false so that the page allocator slow path only pays for the
 * "allocation forbidden by cpuset" check once an abnormal memory binding
 * has been detected.
 * NOTE(review): the code that flips this key is not in this part of the
 * file -- confirm where it is enabled.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
|
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
/* See "Frequency meter" comments, below. */
|
|
|
|
|
|
|
|
struct fmeter {
|
|
|
|
int cnt; /* unprocessed events count */
|
|
|
|
int val; /* most recent output value */
|
2015-11-25 23:16:55 +08:00
|
|
|
time64_t time; /* clock (secs) when val computed */
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
spinlock_t lock; /* guards read or write of above */
|
|
|
|
};
|
|
|
|
|
2022-09-02 04:57:41 +08:00
|
|
|
/*
 * Error codes recording why a partition is invalid.
 *
 * Values are consecutive starting at 0 and are used as indices into
 * perr_strings[], so new codes must be added to both in step.
 */
enum prs_errcode {
	PERR_NONE = 0,		/* no error recorded */
	PERR_INVCPUS,		/* invalid cpu list in cpuset.cpus */
	PERR_INVPARENT,		/* parent is an invalid partition root */
	PERR_NOTPART,		/* parent is not a partition root */
	PERR_NOTEXCL,		/* cpu list in cpuset.cpus is not exclusive */
	PERR_NOCPUS,		/* parent unable to distribute cpus downstream */
	PERR_HOTPLUG,		/* no cpu available due to hotplug */
	PERR_CPUSEMPTY,		/* cpuset.cpus is empty */
};
|
|
|
|
|
|
|
|
static const char * const perr_strings[] = {
|
|
|
|
[PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
|
|
|
|
[PERR_INVPARENT] = "Parent is an invalid partition root",
|
|
|
|
[PERR_NOTPART] = "Parent is not a partition root",
|
|
|
|
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
|
|
|
|
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
|
|
|
|
[PERR_HOTPLUG] = "No cpu available due to hotplug",
|
|
|
|
[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct cpuset {
|
2007-10-19 14:39:39 +08:00
|
|
|
struct cgroup_subsys_state css;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long flags; /* "unsigned long" so bitops work */
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
|
2014-07-09 16:48:42 +08:00
|
|
|
/*
|
|
|
|
* On default hierarchy:
|
|
|
|
*
|
|
|
|
* The user-configured masks can only be changed by writing to
|
|
|
|
* cpuset.cpus and cpuset.mems, and won't be limited by the
|
|
|
|
* parent masks.
|
|
|
|
*
|
|
|
|
* The effective masks is the real masks that apply to the tasks
|
|
|
|
* in the cpuset. They may be changed if the configured masks are
|
|
|
|
* changed or hotplug happens.
|
|
|
|
*
|
|
|
|
* effective_mask == configured_mask & parent's effective_mask,
|
|
|
|
* and if it ends up empty, it will inherit the parent's mask.
|
|
|
|
*
|
|
|
|
*
|
2021-01-13 12:37:41 +08:00
|
|
|
* On legacy hierarchy:
|
2014-07-09 16:48:42 +08:00
|
|
|
*
|
|
|
|
* The user-configured masks are always the same with effective masks.
|
|
|
|
*/
|
|
|
|
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
/* user-configured CPUs and Memory Nodes allow to tasks */
|
|
|
|
cpumask_var_t cpus_allowed;
|
|
|
|
nodemask_t mems_allowed;
|
|
|
|
|
|
|
|
/* effective CPUs and Memory Nodes allow to tasks */
|
|
|
|
cpumask_var_t effective_cpus;
|
|
|
|
nodemask_t effective_mems;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-11-08 23:08:36 +08:00
|
|
|
/*
|
|
|
|
* CPUs allocated to child sub-partitions (default hierarchy only)
|
|
|
|
* - CPUs granted by the parent = effective_cpus U subparts_cpus
|
|
|
|
* - effective_cpus and subparts_cpus are mutually exclusive.
|
2018-11-08 23:08:41 +08:00
|
|
|
*
|
|
|
|
* effective_cpus contains only onlined CPUs, but subparts_cpus
|
|
|
|
* may have offlined ones.
|
2018-11-08 23:08:36 +08:00
|
|
|
*/
|
|
|
|
cpumask_var_t subparts_cpus;
|
|
|
|
|
2013-06-09 17:15:08 +08:00
|
|
|
/*
|
|
|
|
* This is old Memory Nodes tasks took on.
|
|
|
|
*
|
|
|
|
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
|
|
|
|
* - A new cpuset's old_mems_allowed is initialized when some
|
|
|
|
* task is moved into it.
|
|
|
|
* - old_mems_allowed is used in cpuset_migrate_mm() when we change
|
|
|
|
* cpuset.mems_allowed and have tasks' nodemask updated, and
|
|
|
|
* then old_mems_allowed is updated to mems_allowed.
|
|
|
|
*/
|
|
|
|
nodemask_t old_mems_allowed;
|
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
struct fmeter fmeter; /* memory_pressure filter */
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
|
|
|
|
* Tasks are being attached to this cpuset. Used to prevent
|
|
|
|
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
|
|
|
|
*/
|
|
|
|
int attach_in_progress;
|
|
|
|
|
2007-10-19 14:40:20 +08:00
|
|
|
/* partition number for rebuild_sched_domains() */
|
|
|
|
int pn;
|
2008-02-07 16:14:43 +08:00
|
|
|
|
2008-04-15 13:04:23 +08:00
|
|
|
/* for custom sched domain */
|
|
|
|
int relax_domain_level;
|
2018-11-08 23:08:36 +08:00
|
|
|
|
|
|
|
/* number of CPUs in subparts_cpus */
|
|
|
|
int nr_subparts_cpus;
|
|
|
|
|
|
|
|
/* partition root state */
|
|
|
|
int partition_root_state;
|
2018-11-08 23:08:40 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Default hierarchy only:
|
|
|
|
* use_parent_ecpus - set if using parent's effective_cpus
|
|
|
|
* child_ecpus_count - # of children with use_parent_ecpus set
|
|
|
|
*/
|
|
|
|
int use_parent_ecpus;
|
|
|
|
int child_ecpus_count;
|
2021-08-11 11:06:02 +08:00
|
|
|
|
2023-05-08 15:58:51 +08:00
|
|
|
/*
|
|
|
|
* number of SCHED_DEADLINE tasks attached to this cpuset, so that we
|
|
|
|
* know when to rebuild associated root domain bandwidth information.
|
|
|
|
*/
|
|
|
|
int nr_deadline_tasks;
|
2023-05-08 15:58:54 +08:00
|
|
|
int nr_migrate_dl_tasks;
|
|
|
|
u64 sum_migrate_dl_bw;
|
2023-05-08 15:58:51 +08:00
|
|
|
|
2022-09-02 04:57:41 +08:00
|
|
|
/* Invalid partition error code, not lock protected */
|
|
|
|
enum prs_errcode prs_err;
|
|
|
|
|
2021-08-11 11:06:02 +08:00
|
|
|
/* Handle for cpuset.cpus.partition */
|
|
|
|
struct cgroup_file partition_file;
|
2018-11-08 23:08:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 * Each invalid state is the arithmetic negation of its valid counterpart.
 * The negative values are parenthesized so that the macros expand safely
 * inside any surrounding expression.
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	(-1)
#define PRS_INVALID_ISOLATED	(-2)
|
2022-09-02 04:57:37 +08:00
|
|
|
|
|
|
|
/*
 * Report whether a raw partition root state value denotes an invalid
 * partition, i.e. whether it is negative.
 */
static inline bool is_prs_invalid(int prs_state)
{
	return !(prs_state >= 0);
}
|
2018-11-08 23:08:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Temporary cpumasks for working with partitions that are passed among
|
|
|
|
* functions to avoid memory allocation in inner functions.
|
|
|
|
*/
|
|
|
|
struct tmpmasks {
|
|
|
|
cpumask_var_t addmask, delmask; /* For partition root */
|
|
|
|
cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
|
2007-10-19 14:39:39 +08:00
|
|
|
{
|
2013-08-09 08:11:23 +08:00
|
|
|
return css ? container_of(css, struct cpuset, css) : NULL;
|
2007-10-19 14:39:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Retrieve the cpuset for a task */
|
|
|
|
static inline struct cpuset *task_cs(struct task_struct *task)
|
|
|
|
{
|
2014-02-08 23:36:58 +08:00
|
|
|
return css_cs(task_css(task, cpuset_cgrp_id));
|
2007-10-19 14:39:39 +08:00
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
static inline struct cpuset *parent_cs(struct cpuset *cs)
|
2013-01-08 00:51:08 +08:00
|
|
|
{
|
2014-05-17 01:22:48 +08:00
|
|
|
return css_cs(cs->css.parent);
|
2013-01-08 00:51:08 +08:00
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:51 +08:00
|
|
|
void inc_dl_tasks_cs(struct task_struct *p)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = task_cs(p);
|
|
|
|
|
|
|
|
cs->nr_deadline_tasks++;
|
|
|
|
}
|
|
|
|
|
|
|
|
void dec_dl_tasks_cs(struct task_struct *p)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = task_cs(p);
|
|
|
|
|
|
|
|
cs->nr_deadline_tasks--;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Bit numbers (not masks) within the 'flags' field of struct cpuset.
 * They are consumed by test_bit() in the is_*() helpers.
 */
typedef enum {
	CS_ONLINE = 0,		/* tested by is_cpuset_online() */
	CS_CPU_EXCLUSIVE,	/* tested by is_cpu_exclusive() */
	CS_MEM_EXCLUSIVE,	/* tested by is_mem_exclusive() */
	CS_MEM_HARDWALL,	/* tested by is_mem_hardwall() */
	CS_MEMORY_MIGRATE,	/* tested by is_memory_migrate() */
	CS_SCHED_LOAD_BALANCE,	/* tested by is_sched_load_balance() */
	CS_SPREAD_PAGE,		/* tested by is_spread_page() */
	CS_SPREAD_SLAB,		/* tested by is_spread_slab() */
} cpuset_flagbits_t;
|
|
|
|
|
|
|
|
/* convenient tests for these bits */
|
2017-05-25 00:03:48 +08:00
|
|
|
static inline bool is_cpuset_online(struct cpuset *cs)
|
2013-01-08 00:51:07 +08:00
|
|
|
{
|
2017-05-25 00:03:48 +08:00
|
|
|
return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
|
2013-01-08 00:51:07 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int is_cpu_exclusive(const struct cpuset *cs)
|
|
|
|
{
|
2006-03-24 19:16:00 +08:00
|
|
|
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int is_mem_exclusive(const struct cpuset *cs)
|
|
|
|
{
|
2006-03-24 19:16:00 +08:00
|
|
|
return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-04-29 16:00:26 +08:00
|
|
|
static inline int is_mem_hardwall(const struct cpuset *cs)
|
|
|
|
{
|
|
|
|
return test_bit(CS_MEM_HARDWALL, &cs->flags);
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:20 +08:00
|
|
|
static inline int is_sched_load_balance(const struct cpuset *cs)
|
|
|
|
{
|
|
|
|
return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
|
|
|
}
|
|
|
|
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpuset's new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
static inline int is_memory_migrate(const struct cpuset *cs)
|
|
|
|
{
|
2006-03-24 19:16:00 +08:00
|
|
|
return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a task's stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
static inline int is_spread_page(const struct cpuset *cs)
|
|
|
|
{
|
|
|
|
return test_bit(CS_SPREAD_PAGE, &cs->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int is_spread_slab(const struct cpuset *cs)
|
|
|
|
{
|
|
|
|
return test_bit(CS_SPREAD_SLAB, &cs->flags);
|
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
static inline int is_partition_valid(const struct cpuset *cs)
|
2018-11-08 23:08:36 +08:00
|
|
|
{
|
2018-11-08 23:08:39 +08:00
|
|
|
return cs->partition_root_state > 0;
|
2018-11-08 23:08:36 +08:00
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
static inline int is_partition_invalid(const struct cpuset *cs)
|
|
|
|
{
|
|
|
|
return cs->partition_root_state < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Callers should hold callback_lock to modify partition_root_state.
|
|
|
|
*/
|
|
|
|
static inline void make_partition_invalid(struct cpuset *cs)
|
|
|
|
{
|
2022-09-02 04:57:40 +08:00
|
|
|
if (is_partition_valid(cs))
|
|
|
|
cs->partition_root_state = -cs->partition_root_state;
|
2022-09-02 04:57:37 +08:00
|
|
|
}
|
|
|
|
|
2021-08-11 11:06:02 +08:00
|
|
|
/*
|
|
|
|
* Send notification event of whenever partition_root_state changes.
|
|
|
|
*/
|
2022-09-02 04:57:37 +08:00
|
|
|
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
|
2021-08-11 11:06:02 +08:00
|
|
|
{
|
2022-09-02 04:57:37 +08:00
|
|
|
if (old_prs == cs->partition_root_state)
|
|
|
|
return;
|
|
|
|
cgroup_file_notify(&cs->partition_file);
|
2022-09-02 04:57:41 +08:00
|
|
|
|
|
|
|
/* Reset prs_err if not invalid */
|
|
|
|
if (is_partition_valid(cs))
|
|
|
|
WRITE_ONCE(cs->prs_err, PERR_NONE);
|
2021-08-11 11:06:02 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static struct cpuset top_cpuset = {
|
2013-01-08 00:51:07 +08:00
|
|
|
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
|
|
|
|
(1 << CS_MEM_EXCLUSIVE)),
|
2022-09-02 04:57:37 +08:00
|
|
|
.partition_root_state = PRS_ROOT,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/**
 * cpuset_for_each_child - iterate over the online children of a cpuset
 * @child_cs: loop cursor, set to the current child cpuset
 * @pos_css: css cursor used to drive the iteration
 * @parent_cs: cpuset whose children are walked
 *
 * Visits every child css of @parent_cs and runs the loop body only for
 * those whose cpuset is online.  Must be used with RCU read locked.
 *
 * The expansion ends in a bare "if", so an "else" placed right after the
 * loop body would attach to that hidden "if".
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online((child_cs) = css_cs(pos_css)))
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2013-01-08 00:51:08 +08:00
|
|
|
/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The expansion ends in a bare "if", so an "else" placed right after the
 * loop body would attach to that hidden "if".
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2023-05-08 15:58:50 +08:00
|
|
|
* There are two global locks guarding cpuset structures - cpuset_mutex and
|
2014-10-20 19:50:29 +08:00
|
|
|
* callback_lock. We also require taking task_lock() when dereferencing a
|
|
|
|
* task's cpuset pointer. See "The task_lock() exception", at the end of this
|
2023-05-08 15:58:50 +08:00
|
|
|
* comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
|
|
|
|
* can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
|
|
|
|
* structures. Note that cpuset_mutex needs to be a mutex as it is used in
|
|
|
|
* paths that rely on priority inheritance (e.g. scheduler - on RT) for
|
|
|
|
* correctness.
|
2013-01-08 00:51:08 +08:00
|
|
|
*
|
2014-10-20 19:50:29 +08:00
|
|
|
* A task must hold both locks to modify cpusets. If a task holds
|
2023-05-08 15:58:50 +08:00
|
|
|
* cpuset_mutex, it blocks others, ensuring that it is the only task able to
|
|
|
|
* also acquire callback_lock and be able to modify cpusets. It can perform
|
|
|
|
* various checks on the cpuset structure first, knowing nothing will change.
|
|
|
|
* It can also allocate memory while just holding cpuset_mutex. While it is
|
|
|
|
* performing these checks, various callback routines can briefly acquire
|
|
|
|
* callback_lock to query cpusets. Once it is ready to make the changes, it
|
|
|
|
* takes callback_lock, blocking everyone else.
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
*
|
|
|
|
* Calls to the kernel memory allocator can not be made while holding
|
2014-10-20 19:50:29 +08:00
|
|
|
* callback_lock, as that would risk double tripping on callback_lock
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
* from one of the callbacks into the cpuset code from within
|
|
|
|
* __alloc_pages().
|
|
|
|
*
|
2014-10-20 19:50:29 +08:00
|
|
|
* If a task is only holding callback_lock, then it has read-only
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
* access to cpusets.
|
|
|
|
*
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
* Now, the task_struct fields mems_allowed and mempolicy may be changed
|
|
|
|
* by another task, so we use alloc_lock in the task_struct to protect
|
|
|
|
* them.
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
*
|
2014-10-20 19:50:29 +08:00
|
|
|
* The cpuset_common_file_read() handlers only hold callback_lock across
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
* small pieces of code, such as when reading out possibly multi-word
|
|
|
|
* cpumasks and nodemasks.
|
|
|
|
*
|
2008-02-07 16:14:45 +08:00
|
|
|
* Accessing a task's cpuset should be done in accordance with the
|
|
|
|
* guidelines for accessing subsystem state in kernel/cgroup.c
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
/*
 * Held by any task that intends to modify cpusets; it keeps other would-be
 * modifiers out.  See the locking overview comment above.
 */
static DEFINE_MUTEX(cpuset_mutex);
|
2019-07-19 22:00:00 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
void cpuset_lock(void)
|
2019-07-19 22:00:00 +08:00
|
|
|
{
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2019-07-19 22:00:00 +08:00
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
void cpuset_unlock(void)
|
2019-07-19 22:00:00 +08:00
|
|
|
{
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2019-07-19 22:00:00 +08:00
|
|
|
}
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
/*
 * Taken briefly by callbacks that only query cpusets, and taken in addition
 * to cpuset_mutex by a task that is about to modify them.  See the locking
 * overview comment above.
 */
static DEFINE_SPINLOCK(callback_lock);
|
[PATCH] cpuset semaphore depth check deadlock fix
The cpusets-formalize-intermediate-gfp_kernel-containment patch
has a deadlock problem.
This patch was part of a set of four patches to make more
extensive use of the cpuset 'mem_exclusive' attribute to
manage kernel GFP_KERNEL memory allocations and to constrain
the out-of-memory (oom) killer.
A task that is changing cpusets in particular ways on a system
when it is very short of free memory could double trip over
the global cpuset_sem semaphore (get the lock and then deadlock
trying to get it again).
The second attempt to get cpuset_sem would be in the routine
cpuset_zone_allowed(). This was discovered by code inspection.
I can not reproduce the problem except with an artifically
hacked kernel and a specialized stress test.
In real life you cannot hit this unless you are manipulating
cpusets, and are very unlikely to hit it unless you are rapidly
modifying cpusets on a memory tight system. Even then it would
be a rare occurence.
If you did hit it, the task double tripping over cpuset_sem
would deadlock in the kernel, and any other task also trying
to manipulate cpusets would deadlock there too, on cpuset_sem.
Your batch manager would be wedged solid (if it was cpuset
savvy), but classic Unix shells and utilities would work well
enough to reboot the system.
The unusual condition that led to this bug is that unlike most
semaphores, cpuset_sem _can_ be acquired while in the page
allocation code, when __alloc_pages() calls cpuset_zone_allowed.
So it easy to mistakenly perform the following sequence:
1) task makes system call to alter a cpuset
2) take cpuset_sem
3) try to allocate memory
4) memory allocator, via cpuset_zone_allowed, trys to take cpuset_sem
5) deadlock
The reason that this is not a serious bug for most users
is that almost all calls to allocate memory don't require
taking cpuset_sem. Only some code paths off the beaten
track require taking cpuset_sem -- which is good. Taking
a global semaphore on the main code path for allocating
memory would not scale well.
This patch fixes this deadlock by wrapping the up() and down()
calls on cpuset_sem in kernel/cpuset.c with code that tracks
the nesting depth of the current task on that semaphore, and
only does the real down() if the task doesn't hold the lock
already, and only does the real up() if the nesting depth
(number of unmatched downs) is exactly one.
The previous required use of refresh_mems(), anytime that
the cpuset_sem semaphore was acquired and the code executed
while holding that semaphore might try to allocate memory, is
no longer required. Two refresh_mems() calls were removed
thanks to this. This is a good change, as failing to get
all the necessary refresh_mems() calls placed was a primary
source of bugs in this cpuset code. The only remaining call
to refresh_mems() is made while doing a memory allocation,
if certain task memory placement data needs to be updated
from its cpuset, due to the cpuset having been changed behind
the tasks back.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 15:26:06 +08:00
|
|
|
|
2016-01-20 01:18:41 +08:00
|
|
|
/*
 * Workqueue handle; it is created and used outside this part of the file.
 * NOTE(review): judging by the name it carries mm migration work -- confirm
 * at the users.
 */
static struct workqueue_struct *cpuset_migrate_mm_wq;
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
 * CPU and memory hotplug events are handled asynchronously: the work item
 * cpuset_hotplug_work is bound to the handler cpuset_hotplug_workfn().
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
|
|
|
|
|
2013-06-09 17:14:22 +08:00
|
|
|
/*
 * Wait queue head; its waiters and wakers are outside this part of the file.
 * NOTE(review): presumably used to wait for in-flight attach operations --
 * confirm at the users.
 */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
|
|
|
|
|
mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.
It can be reproduced with command:
$ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"
(where node 4 is a movable node)
runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
Call Trace:
dump_stack+0x6b/0x88
dump_header+0x4a/0x1e2
oom_kill_process.cold+0xb/0x10
out_of_memory.part.0+0xaf/0x230
out_of_memory+0x3d/0x80
__alloc_pages_slowpath.constprop.0+0x954/0xa20
__alloc_pages_nodemask+0x2d3/0x300
pipe_write+0x322/0x590
new_sync_write+0x196/0x1b0
vfs_write+0x1c3/0x1f0
ksys_write+0xa7/0xe0
do_syscall_64+0x52/0xd0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mem-Info:
active_anon:392832 inactive_anon:182 isolated_anon:0
active_file:68130 inactive_file:151527 isolated_file:0
unevictable:2701 dirty:0 writeback:7
slab_reclaimable:51418 slab_unreclaimable:116300
mapped:45825 shmem:735 pagetables:2540 bounce:0
free:159849484 free_pcp:73 free_cma:0
Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0 0
Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB
oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is involved even when normal nodes and movable
nodes both have many free memory.
The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope. The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.
So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.
As page allocation is one of the hottest path in kernel, this check will
hurt all users with sane cpuset configuration, add a static branch check
and detect the abnormal config in cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.
[thanks to Micho Hocko and David Rientjes for suggesting not handling
it inside OOM code, adding cpuset check, refining comments]
Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 04:40:34 +08:00
|
|
|
/*
 * check_insane_mems_config - flag a movable-nodes-only memory binding
 * @nodes: node mask to examine
 *
 * Nothing is done when cpusets_insane_config() is already true.  Otherwise,
 * if @nodes consists of movable-only nodes, the cpusets_insane_config_key
 * static branch is enabled and a notice is logged.
 */
static inline void check_insane_mems_config(nodemask_t *nodes)
{
	/* Already flagged: do not evaluate the mask or log again. */
	if (cpusets_insane_config())
		return;

	if (!movable_only_nodes(nodes))
		return;

	static_branch_enable(&cpusets_insane_config_key);
	pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
		"Cpuset allocations might fail even with a lot of memory available.\n",
		nodemask_pr_args(nodes));
}
|
|
|
|
|
2017-08-18 03:33:10 +08:00
|
|
|
/*
|
2020-03-30 22:06:15 +08:00
|
|
|
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
|
|
|
|
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
|
|
|
|
* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
|
|
|
|
* With v2 behavior, "cpus" and "mems" are always what the users have
|
|
|
|
* requested and won't be changed by hotplug events. Only the effective
|
|
|
|
* cpus or mems will be affected.
|
2017-08-18 03:33:10 +08:00
|
|
|
*/
|
|
|
|
static inline bool is_in_v2_mode(void)
|
|
|
|
{
|
|
|
|
return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
|
|
|
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
|
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:38 +08:00
|
|
|
/**
|
|
|
|
* partition_is_populated - check if partition has tasks
|
|
|
|
* @cs: partition root to be checked
|
|
|
|
* @excluded_child: a child cpuset to be excluded in task checking
|
|
|
|
* Return: true if there are tasks, false otherwise
|
|
|
|
*
|
|
|
|
* It is assumed that @cs is a valid partition root. @excluded_child should
|
|
|
|
* be non-NULL when this cpuset is going to become a partition itself.
|
|
|
|
*/
|
|
|
|
static inline bool partition_is_populated(struct cpuset *cs,
|
|
|
|
struct cpuset *excluded_child)
|
|
|
|
{
|
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
struct cpuset *child;
|
|
|
|
|
|
|
|
if (cs->css.cgroup->nr_populated_csets)
|
|
|
|
return true;
|
|
|
|
if (!excluded_child && !cs->nr_subparts_cpus)
|
|
|
|
return cgroup_is_populated(cs->css.cgroup);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
cpuset_for_each_child(child, css, cs) {
|
|
|
|
if (child == excluded_child)
|
|
|
|
continue;
|
|
|
|
if (is_partition_valid(child))
|
|
|
|
continue;
|
|
|
|
if (cgroup_is_populated(child->css.cgroup)) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2021-07-30 19:24:30 +08:00
|
|
|
* Return in pmask the portion of a task's cpusets's cpus_allowed that
|
|
|
|
* are online and are capable of running the task. If none are found,
|
|
|
|
* walk up the cpuset hierarchy until we find one that does have some
|
|
|
|
* appropriate cpus.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* One way or another, we guarantee to return some non-empty subset
|
2012-03-29 13:08:31 +08:00
|
|
|
* of cpu_online_mask.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with callback_lock or cpuset_mutex held.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2021-07-30 19:24:30 +08:00
|
|
|
static void guarantee_online_cpus(struct task_struct *tsk,
|
|
|
|
struct cpumask *pmask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2021-07-30 19:24:30 +08:00
|
|
|
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
|
|
|
|
struct cpuset *cs;
|
|
|
|
|
|
|
|
if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
|
|
|
|
cpumask_copy(pmask, cpu_online_mask);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
cs = task_cs(tsk);
|
|
|
|
|
|
|
|
while (!cpumask_intersects(cs->effective_cpus, pmask)) {
|
2013-01-08 00:51:08 +08:00
|
|
|
cs = parent_cs(cs);
|
2016-09-12 12:14:58 +08:00
|
|
|
if (unlikely(!cs)) {
|
|
|
|
/*
|
|
|
|
* The top cpuset doesn't have any online cpu as a
|
|
|
|
* consequence of a race between cpuset_hotplug_work
|
|
|
|
* and cpu hotplug notifier. But we know the top
|
2020-10-16 11:10:28 +08:00
|
|
|
* cpuset's effective_cpus is on its way to be
|
2016-09-12 12:14:58 +08:00
|
|
|
* identical to cpu_online_mask.
|
|
|
|
*/
|
2021-07-30 19:24:30 +08:00
|
|
|
goto out_unlock;
|
2016-09-12 12:14:58 +08:00
|
|
|
}
|
|
|
|
}
|
2021-07-30 19:24:30 +08:00
|
|
|
cpumask_and(pmask, pmask, cs->effective_cpus);
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
rcu_read_unlock();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return in *pmask the portion of a cpusets's mems_allowed that
|
2007-10-16 16:25:38 +08:00
|
|
|
* are online, with memory. If none are online with memory, walk
|
|
|
|
* up the cpuset hierarchy until we find one that does have some
|
2013-06-05 17:15:23 +08:00
|
|
|
* online mems. The top cpuset always has some mems online.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* One way or another, we guarantee to return some non-empty subset
|
2012-12-13 05:51:24 +08:00
|
|
|
* of node_states[N_MEMORY].
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with callback_lock or cpuset_mutex held.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2014-07-09 16:48:32 +08:00
|
|
|
while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
|
2013-01-08 00:51:08 +08:00
|
|
|
cs = parent_cs(cs);
|
2014-07-09 16:48:32 +08:00
|
|
|
nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-06-17 06:31:46 +08:00
|
|
|
/*
|
|
|
|
* update task's spread flag if cpuset's page/slab spread flag is set
|
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with callback_lock or cpuset_mutex held. The check can be skipped
|
2022-11-13 06:19:38 +08:00
|
|
|
* if on default hierarchy.
|
2009-06-17 06:31:46 +08:00
|
|
|
*/
|
2022-11-13 06:19:38 +08:00
|
|
|
static void cpuset_update_task_spread_flags(struct cpuset *cs,
|
2009-06-17 06:31:46 +08:00
|
|
|
struct task_struct *tsk)
|
|
|
|
{
|
2022-11-13 06:19:38 +08:00
|
|
|
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
|
|
|
|
return;
|
|
|
|
|
2009-06-17 06:31:46 +08:00
|
|
|
if (is_spread_page(cs))
|
2014-09-25 09:41:02 +08:00
|
|
|
task_set_spread_page(tsk);
|
2009-06-17 06:31:46 +08:00
|
|
|
else
|
2014-09-25 09:41:02 +08:00
|
|
|
task_clear_spread_page(tsk);
|
|
|
|
|
2009-06-17 06:31:46 +08:00
|
|
|
if (is_spread_slab(cs))
|
2014-09-25 09:41:02 +08:00
|
|
|
task_set_spread_slab(tsk);
|
2009-06-17 06:31:46 +08:00
|
|
|
else
|
2014-09-25 09:41:02 +08:00
|
|
|
task_clear_spread_slab(tsk);
|
2009-06-17 06:31:46 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
|
|
|
|
*
|
|
|
|
* One cpuset is a subset of another if all its allowed CPUs and
|
|
|
|
* Memory Nodes are a subset of the other, and its exclusive flags
|
2023-05-08 15:58:50 +08:00
|
|
|
* are only set if the other's are set. Call holding cpuset_mutex.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
|
|
|
|
{
|
2009-01-08 10:08:44 +08:00
|
|
|
return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
|
2005-04-17 06:20:36 +08:00
|
|
|
nodes_subset(p->mems_allowed, q->mems_allowed) &&
|
|
|
|
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
|
|
|
|
is_mem_exclusive(p) <= is_mem_exclusive(q);
|
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:37 +08:00
|
|
|
/**
|
|
|
|
* alloc_cpumasks - allocate three cpumasks for cpuset
|
|
|
|
* @cs: the cpuset that have cpumasks to be allocated.
|
|
|
|
* @tmp: the tmpmasks structure pointer
|
|
|
|
* Return: 0 if successful, -ENOMEM otherwise.
|
|
|
|
*
|
|
|
|
* Only one of the two input arguments should be non-NULL.
|
|
|
|
*/
|
|
|
|
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
|
|
|
|
{
|
|
|
|
cpumask_var_t *pmask1, *pmask2, *pmask3;
|
|
|
|
|
|
|
|
if (cs) {
|
|
|
|
pmask1 = &cs->cpus_allowed;
|
|
|
|
pmask2 = &cs->effective_cpus;
|
|
|
|
pmask3 = &cs->subparts_cpus;
|
|
|
|
} else {
|
|
|
|
pmask1 = &tmp->new_cpus;
|
|
|
|
pmask2 = &tmp->addmask;
|
|
|
|
pmask3 = &tmp->delmask;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
|
|
|
|
goto free_one;
|
|
|
|
|
|
|
|
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
|
|
|
|
goto free_two;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
free_two:
|
|
|
|
free_cpumask_var(*pmask2);
|
|
|
|
free_one:
|
|
|
|
free_cpumask_var(*pmask1);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* free_cpumasks - free cpumasks in a tmpmasks structure
|
|
|
|
* @cs: the cpuset that have cpumasks to be free.
|
|
|
|
* @tmp: the tmpmasks structure pointer
|
|
|
|
*/
|
|
|
|
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
|
|
|
|
{
|
|
|
|
if (cs) {
|
|
|
|
free_cpumask_var(cs->cpus_allowed);
|
|
|
|
free_cpumask_var(cs->effective_cpus);
|
|
|
|
free_cpumask_var(cs->subparts_cpus);
|
|
|
|
}
|
|
|
|
if (tmp) {
|
|
|
|
free_cpumask_var(tmp->new_cpus);
|
|
|
|
free_cpumask_var(tmp->addmask);
|
|
|
|
free_cpumask_var(tmp->delmask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-01-08 10:08:43 +08:00
|
|
|
/**
|
|
|
|
* alloc_trial_cpuset - allocate a trial cpuset
|
|
|
|
* @cs: the cpuset that the trial cpuset duplicates
|
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
|
2009-01-08 10:08:43 +08:00
|
|
|
{
|
2009-01-08 10:08:44 +08:00
|
|
|
struct cpuset *trial;
|
|
|
|
|
|
|
|
trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
|
|
|
|
if (!trial)
|
|
|
|
return NULL;
|
|
|
|
|
2018-11-08 23:08:37 +08:00
|
|
|
if (alloc_cpumasks(trial, NULL)) {
|
|
|
|
kfree(trial);
|
|
|
|
return NULL;
|
|
|
|
}
|
2009-01-08 10:08:44 +08:00
|
|
|
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
|
|
|
|
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
|
2009-01-08 10:08:44 +08:00
|
|
|
return trial;
|
2009-01-08 10:08:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2018-11-08 23:08:37 +08:00
|
|
|
* free_cpuset - free the cpuset
|
|
|
|
* @cs: the cpuset to be freed
|
2009-01-08 10:08:43 +08:00
|
|
|
*/
|
2018-11-08 23:08:37 +08:00
|
|
|
static inline void free_cpuset(struct cpuset *cs)
|
2009-01-08 10:08:43 +08:00
|
|
|
{
|
2018-11-08 23:08:37 +08:00
|
|
|
free_cpumasks(cs, NULL);
|
|
|
|
kfree(cs);
|
2009-01-08 10:08:43 +08:00
|
|
|
}
|
|
|
|
|
2021-12-17 23:48:54 +08:00
|
|
|
/*
|
|
|
|
* validate_change_legacy() - Validate conditions specific to legacy (v1)
|
|
|
|
* behavior.
|
|
|
|
*/
|
|
|
|
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
|
|
|
|
{
|
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
struct cpuset *c, *par;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(!rcu_read_lock_held());
|
|
|
|
|
|
|
|
/* Each of our child cpusets must be a subset of us */
|
|
|
|
ret = -EBUSY;
|
|
|
|
cpuset_for_each_child(c, css, cur)
|
|
|
|
if (!is_cpuset_subset(c, trial))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
|
|
|
|
ret = -EACCES;
|
|
|
|
par = parent_cs(cur);
|
|
|
|
if (par && !is_cpuset_subset(trial, par))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* validate_change() - Used to validate that any proposed cpuset change
|
|
|
|
* follows the structural rules for cpusets.
|
|
|
|
*
|
|
|
|
* If we replaced the flag and mask values of the current cpuset
|
|
|
|
* (cur) with those values in the trial cpuset (trial), would
|
|
|
|
* our various subset and exclusive rules still be valid? Presumes
|
2023-05-08 15:58:50 +08:00
|
|
|
* cpuset_mutex held.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* 'cur' is the address of an actual, in-use cpuset. Operations
|
|
|
|
* such as list traversal that depend on the actual address of the
|
|
|
|
* cpuset in the list must use cur below, not trial.
|
|
|
|
*
|
|
|
|
* 'trial' is the address of bulk structure copy of cur, with
|
|
|
|
* perhaps one or more of the fields cpus_allowed, mems_allowed,
|
|
|
|
* or flags changed to new, trial values.
|
|
|
|
*
|
|
|
|
* Return 0 if valid, -errno if not.
|
|
|
|
*/
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
static int validate_change(struct cpuset *cur, struct cpuset *trial)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct cpuset *c, *par;
|
2021-12-17 23:48:54 +08:00
|
|
|
int ret = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-12-06 02:32:14 +08:00
|
|
|
rcu_read_lock();
|
2006-12-07 12:36:15 +08:00
|
|
|
|
2021-12-17 23:48:54 +08:00
|
|
|
if (!is_in_v2_mode())
|
|
|
|
ret = validate_change_legacy(cur, trial);
|
|
|
|
if (ret)
|
2013-01-08 00:51:07 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-12-17 23:48:54 +08:00
|
|
|
/* Remaining checks don't apply to root cpuset */
|
|
|
|
if (cur == &top_cpuset)
|
2013-01-08 00:51:07 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-12-17 23:48:54 +08:00
|
|
|
par = parent_cs(cur);
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
|
|
|
|
* Cpusets with tasks - existing or newly being attached - can't
|
2013-08-21 10:22:28 +08:00
|
|
|
* be changed to have empty cpus_allowed or mems_allowed.
|
2013-01-08 00:51:07 +08:00
|
|
|
*/
|
2013-01-08 00:51:07 +08:00
|
|
|
ret = -ENOSPC;
|
2015-10-16 04:41:50 +08:00
|
|
|
if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
|
2013-08-21 10:22:28 +08:00
|
|
|
if (!cpumask_empty(cur->cpus_allowed) &&
|
|
|
|
cpumask_empty(trial->cpus_allowed))
|
|
|
|
goto out;
|
|
|
|
if (!nodes_empty(cur->mems_allowed) &&
|
|
|
|
nodes_empty(trial->mems_allowed))
|
|
|
|
goto out;
|
|
|
|
}
|
2007-10-19 14:40:21 +08:00
|
|
|
|
2014-10-07 16:52:11 +08:00
|
|
|
/*
|
|
|
|
* We can't shrink if we won't have enough room for SCHED_DEADLINE
|
|
|
|
* tasks.
|
|
|
|
*/
|
|
|
|
ret = -EBUSY;
|
|
|
|
if (is_cpu_exclusive(cur) &&
|
|
|
|
!cpuset_cpumask_can_shrink(cur->cpus_allowed,
|
|
|
|
trial->cpus_allowed))
|
|
|
|
goto out;
|
|
|
|
|
2022-09-02 04:57:42 +08:00
|
|
|
/*
|
|
|
|
* If either I or some sibling (!= me) is exclusive, we can't
|
|
|
|
* overlap
|
|
|
|
*/
|
|
|
|
ret = -EINVAL;
|
|
|
|
cpuset_for_each_child(c, css, par) {
|
|
|
|
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
|
|
|
|
c != cur &&
|
|
|
|
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
|
|
|
|
goto out;
|
|
|
|
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
|
|
|
|
c != cur &&
|
|
|
|
nodes_intersects(trial->mems_allowed, c->mems_allowed))
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-04-03 07:57:55 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2007-10-19 14:40:20 +08:00
|
|
|
/*
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
* Helper routine for generate_sched_domains().
|
2014-07-09 16:47:50 +08:00
|
|
|
* Do cpusets a, b have overlapping effective cpus_allowed masks?
|
2007-10-19 14:40:20 +08:00
|
|
|
*/
|
|
|
|
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
|
|
|
|
{
|
2014-07-09 16:47:50 +08:00
|
|
|
return cpumask_intersects(a->effective_cpus, b->effective_cpus);
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
|
|
|
|
2008-04-15 13:04:23 +08:00
|
|
|
static void
|
|
|
|
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
|
|
|
|
{
|
|
|
|
if (dattr->relax_domain_level < c->relax_domain_level)
|
|
|
|
dattr->relax_domain_level = c->relax_domain_level;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-01-08 00:51:08 +08:00
|
|
|
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
|
|
|
|
struct cpuset *root_cs)
|
2008-07-30 13:33:22 +08:00
|
|
|
{
|
2013-01-08 00:51:08 +08:00
|
|
|
struct cpuset *cp;
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2008-07-30 13:33:22 +08:00
|
|
|
|
2013-01-08 00:51:08 +08:00
|
|
|
rcu_read_lock();
|
2013-08-09 08:11:25 +08:00
|
|
|
cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
|
2013-01-08 00:51:08 +08:00
|
|
|
/* skip the whole subtree if @cp doesn't have any CPU */
|
|
|
|
if (cpumask_empty(cp->cpus_allowed)) {
|
2013-08-09 08:11:25 +08:00
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
2008-07-30 13:33:22 +08:00
|
|
|
continue;
|
2013-01-08 00:51:08 +08:00
|
|
|
}
|
2008-07-30 13:33:22 +08:00
|
|
|
|
|
|
|
if (is_sched_load_balance(cp))
|
|
|
|
update_domain_attr(dattr, cp);
|
|
|
|
}
|
2013-01-08 00:51:08 +08:00
|
|
|
rcu_read_unlock();
|
2008-07-30 13:33:22 +08:00
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
/* Must be called with cpuset_mutex held. */
|
2017-08-01 23:24:06 +08:00
|
|
|
static inline int nr_cpusets(void)
|
|
|
|
{
|
|
|
|
/* jump label reference count + the top-level cpuset */
|
|
|
|
return static_key_count(&cpusets_enabled_key.key) + 1;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:20 +08:00
|
|
|
/*
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
* generate_sched_domains()
|
|
|
|
*
|
|
|
|
* This function builds a partial partition of the system's CPUs.
|
|
|
|
* A 'partial partition' is a set of non-overlapping subsets whose
|
|
|
|
* union is a subset of that set.
|
2013-06-04 15:40:24 +08:00
|
|
|
* The output of this function needs to be passed to kernel/sched/core.c
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
* partition_sched_domains() routine, which will rebuild the scheduler's
|
|
|
|
* load balancing domains (sched domains) as specified by that partial
|
|
|
|
* partition.
|
2007-10-19 14:40:20 +08:00
|
|
|
*
|
2019-06-28 00:08:35 +08:00
|
|
|
* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
|
2007-10-19 14:40:20 +08:00
|
|
|
* for a background explanation of this.
|
|
|
|
*
|
|
|
|
* Does not return errors, on the theory that the callers of this
|
|
|
|
* routine would rather not worry about failures to rebuild sched
|
|
|
|
* domains when operating in the severe memory shortage situations
|
|
|
|
* that could cause allocation failures below.
|
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Must be called with cpuset_mutex held.
|
2007-10-19 14:40:20 +08:00
|
|
|
*
|
|
|
|
* The three key local variables below are:
|
2018-12-19 21:34:44 +08:00
|
|
|
* cp - cpuset pointer, used (together with pos_css) to perform a
|
|
|
|
* top-down scan of all cpusets. For our purposes, rebuilding
|
|
|
|
* the scheduler's sched domains, we can ignore !is_sched_load_
|
|
|
|
* balance cpusets.
|
2007-10-19 14:40:20 +08:00
|
|
|
* csa - (for CpuSet Array) Array of pointers to all the cpusets
|
|
|
|
* that need to be load balanced, for convenient iterative
|
|
|
|
* access by the subsequent code that finds the best partition,
|
|
|
|
* i.e. the set of domains (subsets) of CPUs such that the
|
|
|
|
* cpus_allowed of every cpuset marked is_sched_load_balance
|
|
|
|
* is a subset of one of these domains, while there are as
|
|
|
|
* many such domains as possible, each as small as possible.
|
|
|
|
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
|
2013-06-04 15:40:24 +08:00
|
|
|
* the kernel/sched/core.c routine partition_sched_domains() in a
|
2007-10-19 14:40:20 +08:00
|
|
|
* convenient format, that can be easily compared to the prior
|
|
|
|
* value to determine what partition elements (sched domains)
|
|
|
|
* were changed (added or removed.)
|
|
|
|
*
|
|
|
|
* Finding the best partition (set of domains):
|
|
|
|
* The triple nested loops below over i, j, k scan over the
|
|
|
|
* load balanced cpusets (using the array of cpuset pointers in
|
|
|
|
* csa[]) looking for pairs of cpusets that have overlapping
|
|
|
|
* cpus_allowed, but which don't have the same 'pn' partition
|
|
|
|
* number and puts them in the same partition number. It keeps
|
|
|
|
* looping on the 'restart' label until it can no longer find
|
|
|
|
* any such pairs.
|
|
|
|
*
|
|
|
|
* The union of the cpus_allowed masks from the set of
|
|
|
|
* all cpusets having the same 'pn' value then form the one
|
|
|
|
* element of the partition (one sched domain) to be passed to
|
|
|
|
* partition_sched_domains().
|
|
|
|
*/
|
2009-11-03 12:23:40 +08:00
|
|
|
static int generate_sched_domains(cpumask_var_t **domains,
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
struct sched_domain_attr **attributes)
|
2007-10-19 14:40:20 +08:00
|
|
|
{
|
2018-12-19 21:34:44 +08:00
|
|
|
struct cpuset *cp; /* top-down scan of cpusets */
|
2007-10-19 14:40:20 +08:00
|
|
|
struct cpuset **csa; /* array of all cpuset ptrs */
|
|
|
|
int csn; /* how many cpuset ptrs in csa so far */
|
|
|
|
int i, j, k; /* indices for partition finding loops */
|
2009-11-03 12:23:40 +08:00
|
|
|
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
|
2008-04-15 13:04:23 +08:00
|
|
|
struct sched_domain_attr *dattr; /* attributes for custom domains */
|
2008-11-25 17:27:49 +08:00
|
|
|
int ndoms = 0; /* number of sched domains in result */
|
2009-01-08 10:08:45 +08:00
|
|
|
int nslot; /* next empty doms[] struct cpumask slot */
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2018-11-08 23:08:42 +08:00
|
|
|
bool root_load_balance = is_sched_load_balance(&top_cpuset);
|
2007-10-19 14:40:20 +08:00
|
|
|
|
|
|
|
doms = NULL;
|
2008-04-15 13:04:23 +08:00
|
|
|
dattr = NULL;
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
csa = NULL;
|
2007-10-19 14:40:20 +08:00
|
|
|
|
|
|
|
/* Special case for the 99% of systems with one, full, sched domain */
|
2018-11-08 23:08:42 +08:00
|
|
|
if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
|
2009-11-03 12:23:40 +08:00
|
|
|
ndoms = 1;
|
|
|
|
doms = alloc_sched_domains(ndoms);
|
2007-10-19 14:40:20 +08:00
|
|
|
if (!doms)
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
goto done;
|
|
|
|
|
2008-04-15 13:04:23 +08:00
|
|
|
dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
|
|
|
|
if (dattr) {
|
|
|
|
*dattr = SD_ATTR_INIT;
|
2008-07-30 13:33:23 +08:00
|
|
|
update_domain_attr_tree(dattr, &top_cpuset);
|
2008-04-15 13:04:23 +08:00
|
|
|
}
|
2015-03-10 00:12:08 +08:00
|
|
|
cpumask_and(doms[0], top_cpuset.effective_cpus,
|
2022-02-07 23:59:06 +08:00
|
|
|
housekeeping_cpumask(HK_TYPE_DOMAIN));
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
|
|
|
|
goto done;
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
|
|
|
|
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a * b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 04:55:00 +08:00
|
|
|
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
|
2007-10-19 14:40:20 +08:00
|
|
|
if (!csa)
|
|
|
|
goto done;
|
|
|
|
csn = 0;
|
|
|
|
|
2013-01-08 00:51:08 +08:00
|
|
|
rcu_read_lock();
|
2018-11-08 23:08:42 +08:00
|
|
|
if (root_load_balance)
|
|
|
|
csa[csn++] = &top_cpuset;
|
2013-08-09 08:11:25 +08:00
|
|
|
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
|
2013-08-09 08:11:27 +08:00
|
|
|
if (cp == &top_cpuset)
|
|
|
|
continue;
|
2008-07-30 13:33:22 +08:00
|
|
|
/*
|
2013-01-08 00:51:08 +08:00
|
|
|
* Continue traversing beyond @cp iff @cp has some CPUs and
|
|
|
|
* isn't load balancing. The former is obvious. The
|
|
|
|
* latter: All child cpusets contain a subset of the
|
|
|
|
* parent's cpus, so just skip them, and then we call
|
|
|
|
* update_domain_attr_tree() to calc relax_domain_level of
|
|
|
|
* the corresponding sched domain.
|
2018-11-08 23:08:42 +08:00
|
|
|
*
|
|
|
|
* If root is load-balancing, we can skip @cp if it
|
|
|
|
* is a subset of the root's effective_cpus.
|
2008-07-30 13:33:22 +08:00
|
|
|
*/
|
2013-01-08 00:51:08 +08:00
|
|
|
if (!cpumask_empty(cp->cpus_allowed) &&
|
2015-03-10 00:12:08 +08:00
|
|
|
!(is_sched_load_balance(cp) &&
|
2017-10-27 10:42:37 +08:00
|
|
|
cpumask_intersects(cp->cpus_allowed,
|
2022-02-07 23:59:06 +08:00
|
|
|
housekeeping_cpumask(HK_TYPE_DOMAIN))))
|
2008-07-30 13:33:22 +08:00
|
|
|
continue;
|
2008-07-25 16:47:23 +08:00
|
|
|
|
2018-11-08 23:08:42 +08:00
|
|
|
if (root_load_balance &&
|
|
|
|
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
|
|
|
|
continue;
|
|
|
|
|
2019-10-23 23:37:44 +08:00
|
|
|
if (is_sched_load_balance(cp) &&
|
|
|
|
!cpumask_empty(cp->effective_cpus))
|
2013-01-08 00:51:08 +08:00
|
|
|
csa[csn++] = cp;
|
|
|
|
|
2018-11-08 23:08:42 +08:00
|
|
|
/* skip @cp's subtree if not a partition root */
|
2022-09-02 04:57:37 +08:00
|
|
|
if (!is_partition_valid(cp))
|
2018-11-08 23:08:42 +08:00
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
2013-01-08 00:51:08 +08:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
2007-10-19 14:40:20 +08:00
|
|
|
|
|
|
|
for (i = 0; i < csn; i++)
|
|
|
|
csa[i]->pn = i;
|
|
|
|
ndoms = csn;
|
|
|
|
|
|
|
|
restart:
|
|
|
|
/* Find the best partition (set of sched domains) */
|
|
|
|
for (i = 0; i < csn; i++) {
|
|
|
|
struct cpuset *a = csa[i];
|
|
|
|
int apn = a->pn;
|
|
|
|
|
|
|
|
for (j = 0; j < csn; j++) {
|
|
|
|
struct cpuset *b = csa[j];
|
|
|
|
int bpn = b->pn;
|
|
|
|
|
|
|
|
if (apn != bpn && cpusets_overlap(a, b)) {
|
|
|
|
for (k = 0; k < csn; k++) {
|
|
|
|
struct cpuset *c = csa[k];
|
|
|
|
|
|
|
|
if (c->pn == bpn)
|
|
|
|
c->pn = apn;
|
|
|
|
}
|
|
|
|
ndoms--; /* one less element */
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
/*
|
|
|
|
* Now we know how many domains to create.
|
|
|
|
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
|
|
|
|
*/
|
2009-11-03 12:23:40 +08:00
|
|
|
doms = alloc_sched_domains(ndoms);
|
2008-11-18 14:02:03 +08:00
|
|
|
if (!doms)
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
goto done;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The rest of the code, including the scheduler, can deal with
|
|
|
|
* dattr==NULL case. No need to abort if alloc fails.
|
|
|
|
*/
|
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a, b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 04:55:00 +08:00
|
|
|
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
|
|
|
|
GFP_KERNEL);
|
2007-10-19 14:40:20 +08:00
|
|
|
|
|
|
|
for (nslot = 0, i = 0; i < csn; i++) {
|
|
|
|
struct cpuset *a = csa[i];
|
2009-01-08 10:08:45 +08:00
|
|
|
struct cpumask *dp;
|
2007-10-19 14:40:20 +08:00
|
|
|
int apn = a->pn;
|
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
if (apn < 0) {
|
|
|
|
/* Skip completed partitions */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2009-11-03 12:23:40 +08:00
|
|
|
dp = doms[nslot];
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
|
|
|
|
if (nslot == ndoms) {
|
|
|
|
static int warnings = 10;
|
|
|
|
if (warnings) {
|
2014-05-06 01:49:00 +08:00
|
|
|
pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
|
|
|
|
nslot, ndoms, csn, i, apn);
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
warnings--;
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
continue;
|
|
|
|
}
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2009-01-08 10:08:45 +08:00
|
|
|
cpumask_clear(dp);
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
if (dattr)
|
|
|
|
*(dattr + nslot) = SD_ATTR_INIT;
|
|
|
|
for (j = i; j < csn; j++) {
|
|
|
|
struct cpuset *b = csa[j];
|
|
|
|
|
|
|
|
if (apn == b->pn) {
|
2014-07-09 16:47:50 +08:00
|
|
|
cpumask_or(dp, dp, b->effective_cpus);
|
2022-02-07 23:59:06 +08:00
|
|
|
cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
if (dattr)
|
|
|
|
update_domain_attr_tree(dattr + nslot, b);
|
|
|
|
|
|
|
|
/* Done with this partition */
|
|
|
|
b->pn = -1;
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
|
|
|
}
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
nslot++;
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
|
|
|
BUG_ON(nslot != ndoms);
|
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
done:
|
|
|
|
kfree(csa);
|
|
|
|
|
2008-11-18 14:02:03 +08:00
|
|
|
/*
|
|
|
|
* Fallback to the default domain if kmalloc() failed.
|
|
|
|
* See comments in partition_sched_domains().
|
|
|
|
*/
|
|
|
|
if (doms == NULL)
|
|
|
|
ndoms = 1;
|
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
*domains = doms;
|
|
|
|
*attributes = dattr;
|
|
|
|
return ndoms;
|
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:49 +08:00
|
|
|
static void dl_update_tasks_root_domain(struct cpuset *cs)
|
2019-07-19 21:59:55 +08:00
|
|
|
{
|
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
|
|
|
|
2023-05-08 15:58:52 +08:00
|
|
|
if (cs->nr_deadline_tasks == 0)
|
|
|
|
return;
|
|
|
|
|
2019-07-19 21:59:55 +08:00
|
|
|
css_task_iter_start(&cs->css, 0, &it);
|
|
|
|
|
|
|
|
while ((task = css_task_iter_next(&it)))
|
|
|
|
dl_add_task_root_domain(task);
|
|
|
|
|
|
|
|
css_task_iter_end(&it);
|
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:49 +08:00
|
|
|
static void dl_rebuild_rd_accounting(void)
|
2019-07-19 21:59:55 +08:00
|
|
|
{
|
|
|
|
struct cpuset *cs = NULL;
|
|
|
|
struct cgroup_subsys_state *pos_css;
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
lockdep_assert_held(&cpuset_mutex);
|
2019-07-19 21:59:55 +08:00
|
|
|
lockdep_assert_cpus_held();
|
|
|
|
lockdep_assert_held(&sched_domains_mutex);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear default root domain DL accounting, it will be computed again
|
|
|
|
* if a task belongs to it.
|
|
|
|
*/
|
|
|
|
dl_clear_root_domain(&def_root_domain);
|
|
|
|
|
|
|
|
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
|
|
|
|
|
|
|
if (cpumask_empty(cs->effective_cpus)) {
|
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
css_get(&cs->css);
|
|
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2023-05-08 15:58:49 +08:00
|
|
|
dl_update_tasks_root_domain(cs);
|
2019-07-19 21:59:55 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
css_put(&cs->css);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
|
|
|
struct sched_domain_attr *dattr_new)
|
|
|
|
{
|
|
|
|
mutex_lock(&sched_domains_mutex);
|
|
|
|
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
|
2023-05-08 15:58:49 +08:00
|
|
|
dl_rebuild_rd_accounting();
|
2019-07-19 21:59:55 +08:00
|
|
|
mutex_unlock(&sched_domains_mutex);
|
|
|
|
}
|
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
/*
|
|
|
|
* Rebuild scheduler domains.
|
|
|
|
*
|
2013-01-08 00:51:07 +08:00
|
|
|
* If the flag 'sched_load_balance' of any cpuset with non-empty
|
|
|
|
* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
|
|
|
|
* which has that flag enabled, or if any cpuset with a non-empty
|
|
|
|
* 'cpus' is removed, then call this routine to rebuild the
|
|
|
|
* scheduler's dynamic sched domains.
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with cpuset_mutex and cpus_read_lock() held.
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
*/
|
2013-01-08 00:51:07 +08:00
|
|
|
static void rebuild_sched_domains_locked(void)
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
{
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
struct sched_domain_attr *attr;
|
2009-11-03 12:23:40 +08:00
|
|
|
cpumask_var_t *doms;
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
struct cpuset *cs;
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
int ndoms;
|
|
|
|
|
2019-07-19 21:59:58 +08:00
|
|
|
lockdep_assert_cpus_held();
|
2023-05-08 15:58:50 +08:00
|
|
|
lockdep_assert_held(&cpuset_mutex);
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
|
2013-04-27 21:52:43 +08:00
|
|
|
/*
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
* If we have raced with CPU hotplug, return early to avoid
|
2013-04-27 21:52:43 +08:00
|
|
|
* passing doms with offlined cpu to partition_sched_domains().
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
|
|
|
|
*
|
|
|
|
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
|
|
|
|
* should be the same as the active CPUs, so checking only top_cpuset
|
|
|
|
* is enough to detect racing CPU offlines.
|
2013-04-27 21:52:43 +08:00
|
|
|
*/
|
2018-11-08 23:08:42 +08:00
|
|
|
if (!top_cpuset.nr_subparts_cpus &&
|
|
|
|
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
|
2019-07-19 21:59:58 +08:00
|
|
|
return;
|
2018-11-08 23:08:42 +08:00
|
|
|
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
/*
|
|
|
|
* With subpartition CPUs, however, the effective CPUs of a partition
|
|
|
|
* root should be only a subset of the active CPUs. Since a CPU in any
|
|
|
|
* partition root could be offlined, all must be checked.
|
|
|
|
*/
|
|
|
|
if (top_cpuset.nr_subparts_cpus) {
|
|
|
|
rcu_read_lock();
|
|
|
|
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
2022-09-02 04:57:37 +08:00
|
|
|
if (!is_partition_valid(cs)) {
|
cpuset: fix race between hotplug work and later CPU offline
One of our machines keeled over trying to rebuild the scheduler domains.
Mainline produces the same splat:
BUG: unable to handle page fault for address: 0000607f820054db
CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6
Workqueue: events cpuset_hotplug_workfn
RIP: build_sched_domains
Call Trace:
partition_sched_domains_locked
rebuild_sched_domains_locked
cpuset_hotplug_workfn
It happens with cgroup2 and exclusive cpusets only. This reproducer
triggers it on an 8-cpu vm and works most effectively with no
preexisting child cgroups:
cd $UNIFIED_ROOT
mkdir cg1
echo 4-7 > cg1/cpuset.cpus
echo root > cg1/cpuset.cpus.partition
# with smt/control reading 'on',
echo off > /sys/devices/system/cpu/smt/control
RIP maps to
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
from sd_init(). sd_id is calculated earlier in the same function:
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask
and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus
value from per_cpu_ptr() above.
The problem is a race between cpuset_hotplug_workfn() and a later
offline of CPU N. cpuset_hotplug_workfn() updates the effective masks
when N is still online, the offline clears N from cpu_sibling_map, and
then the worker uses the stale effective masks that still have N to
generate the scheduling domains, leading the worker to read
N's empty cpu_sibling_map in sd_init().
rebuild_sched_domains_locked() prevented the race during the cgroup2
cpuset series up until the Fixes commit changed its check. Make the
check more robust so that it can detect an offline CPU in any exclusive
cpuset's effective mask, not just the top one.
Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition")
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com
2020-11-13 01:17:11 +08:00
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!cpumask_subset(cs->effective_cpus,
|
|
|
|
cpu_active_mask)) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2013-04-27 21:52:43 +08:00
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
/* Generate domain masks and attrs */
|
|
|
|
ndoms = generate_sched_domains(&doms, &attr);
|
|
|
|
|
|
|
|
/* Have scheduler rebuild the domains */
|
2019-07-19 21:59:55 +08:00
|
|
|
partition_and_rebuild_sched_domains(ndoms, doms, attr);
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
}
|
2009-04-03 07:57:55 +08:00
|
|
|
#else /* !CONFIG_SMP */
|
2013-01-08 00:51:07 +08:00
|
|
|
/* Stub for !CONFIG_SMP builds: intentionally empty. */
static void rebuild_sched_domains_locked(void)
{
}
|
|
|
|
#endif /* CONFIG_SMP */
|
2007-10-19 14:40:20 +08:00
|
|
|
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all misformatted comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested it with the suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
void rebuild_sched_domains(void)
|
|
|
|
{
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:07 +08:00
|
|
|
rebuild_sched_domains_locked();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2007-10-19 14:40:20 +08:00
|
|
|
}
|
|
|
|
|
2008-07-25 16:47:21 +08:00
|
|
|
/**
|
|
|
|
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
|
|
|
|
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
|
2023-02-01 06:17:19 +08:00
|
|
|
* @new_cpus: the temp variable for the new effective_cpus mask
|
2008-07-25 16:47:21 +08:00
|
|
|
*
|
2014-02-13 19:58:40 +08:00
|
|
|
* Iterate through each task of @cs updating its cpus_allowed to the
|
2023-05-08 15:58:50 +08:00
|
|
|
* effective cpuset's. As this function is called with cpuset_mutex held,
|
2023-03-17 23:15:07 +08:00
|
|
|
* cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
|
|
|
|
* is used instead of effective_cpus to make sure all offline CPUs are also
|
|
|
|
* included as hotplug code won't update cpumasks for tasks in top_cpuset.
|
2008-07-25 16:47:21 +08:00
|
|
|
*/
|
2023-02-01 06:17:19 +08:00
|
|
|
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
|
2008-07-25 16:47:21 +08:00
|
|
|
{
|
2014-02-13 19:58:40 +08:00
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
2022-09-02 04:57:36 +08:00
|
|
|
bool top_cs = cs == &top_cpuset;
|
2014-02-13 19:58:40 +08:00
|
|
|
|
2017-05-15 21:34:01 +08:00
|
|
|
css_task_iter_start(&cs->css, 0, &it);
|
2022-09-02 04:57:36 +08:00
|
|
|
while ((task = css_task_iter_next(&it))) {
|
2023-03-17 23:15:07 +08:00
|
|
|
const struct cpumask *possible_mask = task_cpu_possible_mask(task);
|
2023-02-01 06:17:19 +08:00
|
|
|
|
2023-03-17 23:15:07 +08:00
|
|
|
if (top_cs) {
|
|
|
|
/*
|
|
|
|
* Percpu kthreads in top_cpuset are ignored
|
|
|
|
*/
|
2023-07-04 19:30:49 +08:00
|
|
|
if (kthread_is_per_cpu(task))
|
2023-03-17 23:15:07 +08:00
|
|
|
continue;
|
|
|
|
cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
|
|
|
|
} else {
|
|
|
|
cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
|
|
|
|
}
|
2023-02-01 06:17:19 +08:00
|
|
|
set_cpus_allowed_ptr(task, new_cpus);
|
2022-09-02 04:57:36 +08:00
|
|
|
}
|
2014-02-13 19:58:40 +08:00
|
|
|
css_task_iter_end(&it);
|
2008-07-25 16:47:21 +08:00
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
/**
|
|
|
|
* compute_effective_cpumask - Compute the effective cpumask of the cpuset
|
|
|
|
* @new_cpus: the temp variable for the new effective_cpus mask
|
|
|
|
* @cs: the cpuset the need to recompute the new effective_cpus mask
|
|
|
|
* @parent: the parent cpuset
|
|
|
|
*
|
|
|
|
* If the parent has subpartition CPUs, include them in the list of
|
2018-11-08 23:08:41 +08:00
|
|
|
* allowable CPUs in computing the new effective_cpus mask. Since offlined
|
|
|
|
* CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
|
|
|
|
* to mask those out.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
|
|
|
static void compute_effective_cpumask(struct cpumask *new_cpus,
|
|
|
|
struct cpuset *cs, struct cpuset *parent)
|
|
|
|
{
|
2023-06-27 22:35:01 +08:00
|
|
|
if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
|
2018-11-08 23:08:38 +08:00
|
|
|
cpumask_or(new_cpus, parent->effective_cpus,
|
|
|
|
parent->subparts_cpus);
|
|
|
|
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
|
2018-11-08 23:08:41 +08:00
|
|
|
cpumask_and(new_cpus, new_cpus, cpu_active_mask);
|
2018-11-08 23:08:38 +08:00
|
|
|
} else {
|
|
|
|
cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Command codes accepted by update_parent_subparts_cpumask()
 */
enum subparts_cmd {
	partcmd_enable     = 0,	/* Turn the cpuset into a partition root */
	partcmd_disable    = 1,	/* Turn a partition root back into a member */
	partcmd_update     = 2,	/* Refresh the parent's subparts_cpus */
	partcmd_invalidate = 3,	/* Mark the partition as invalid */
};
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
/* Forward declarations of static helpers defined elsewhere in this file. */
static int
update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);

static void
update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
			struct tmpmasks *tmp);
|
2023-06-27 22:35:01 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Update partition exclusive flag
|
|
|
|
*
|
|
|
|
* Return: 0 if successful, an error code otherwise
|
|
|
|
*/
|
|
|
|
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
|
|
|
|
{
|
|
|
|
bool exclusive = (new_prs > 0);
|
|
|
|
|
|
|
|
if (exclusive && !is_cpu_exclusive(cs)) {
|
|
|
|
if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
|
|
|
|
return PERR_NOTEXCL;
|
|
|
|
} else if (!exclusive && is_cpu_exclusive(cs)) {
|
|
|
|
/* Turning off CS_CPU_EXCLUSIVE will not return error */
|
|
|
|
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update partition load balance flag and/or rebuild sched domain
|
|
|
|
*
|
|
|
|
* Changing load balance flag will automatically call
|
|
|
|
* rebuild_sched_domains_locked().
|
|
|
|
*/
|
|
|
|
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
|
|
|
|
{
|
|
|
|
int new_prs = cs->partition_root_state;
|
|
|
|
bool new_lb = (new_prs != PRS_ISOLATED);
|
|
|
|
bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
|
|
|
|
|
|
|
|
if (new_lb != !!is_sched_load_balance(cs)) {
|
|
|
|
rebuild_domains = true;
|
|
|
|
if (new_lb)
|
|
|
|
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
|
|
|
else
|
|
|
|
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rebuild_domains)
|
|
|
|
rebuild_sched_domains_locked();
|
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
/**
|
|
|
|
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
|
2023-01-08 10:12:17 +08:00
|
|
|
* @cs: The cpuset that requests change in partition root state
|
2018-11-08 23:08:38 +08:00
|
|
|
* @cmd: Partition root state change command
|
|
|
|
* @newmask: Optional new cpumask for partcmd_update
|
|
|
|
* @tmp: Temporary addmask and delmask
|
2022-09-02 04:57:41 +08:00
|
|
|
* Return: 0 or a partition root state error code
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
|
|
|
* For partcmd_enable, the cpuset is being transformed from a non-partition
|
|
|
|
* root to a partition root. The cpus_allowed mask of the given cpuset will
|
|
|
|
* be put into parent's subparts_cpus and taken away from parent's
|
|
|
|
* effective_cpus. The function will return 0 if all the CPUs listed in
|
|
|
|
* cpus_allowed can be granted or an error code will be returned.
|
|
|
|
*
|
2022-03-06 04:46:57 +08:00
|
|
|
* For partcmd_disable, the cpuset is being transformed from a partition
|
2021-07-20 22:18:26 +08:00
|
|
|
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
|
2018-11-08 23:08:38 +08:00
|
|
|
* parent's subparts_cpus will be taken away from that cpumask and put back
|
2022-09-02 04:57:39 +08:00
|
|
|
* into parent's effective_cpus. 0 will always be returned.
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
2022-09-02 04:57:39 +08:00
|
|
|
* For partcmd_update, if the optional newmask is specified, the cpu list is
|
|
|
|
* to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
|
|
|
|
* assumed to remain the same. The cpuset should either be a valid or invalid
|
|
|
|
* partition root. The partition root state may change from valid to invalid
|
|
|
|
* or vice versa. An error code will only be returned if transitioning from
|
|
|
|
* invalid to valid violates the exclusivity rule.
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
2022-09-02 04:57:43 +08:00
|
|
|
* For partcmd_invalidate, the current partition will be made invalid.
|
|
|
|
*
|
2018-11-08 23:08:38 +08:00
|
|
|
* The partcmd_enable and partcmd_disable commands are used by
|
2022-09-02 04:57:39 +08:00
|
|
|
* update_prstate(). An error code may be returned and the caller will check
|
|
|
|
* for error.
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
2022-09-02 04:57:39 +08:00
|
|
|
* The partcmd_update command is used by update_cpumasks_hier() with newmask
|
2022-09-02 04:57:43 +08:00
|
|
|
* NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
|
|
|
|
* by update_cpumask() with NULL newmask. In both cases, the callers won't
|
|
|
|
* check for error and so partition_root_state and prs_error will be updated
|
|
|
|
* directly.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2022-09-02 04:57:37 +08:00
|
|
|
static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
2018-11-08 23:08:38 +08:00
|
|
|
struct cpumask *newmask,
|
|
|
|
struct tmpmasks *tmp)
|
|
|
|
{
|
2022-09-02 04:57:37 +08:00
|
|
|
struct cpuset *parent = parent_cs(cs);
|
2018-11-08 23:08:38 +08:00
|
|
|
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
|
|
|
|
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
|
2021-08-11 11:06:02 +08:00
|
|
|
int old_prs, new_prs;
|
2022-09-02 04:57:41 +08:00
|
|
|
int part_error = PERR_NONE; /* Partition error? */
|
2018-11-08 23:08:38 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
lockdep_assert_held(&cpuset_mutex);
|
2018-11-08 23:08:38 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The parent must be a partition root.
|
|
|
|
* The new cpumask, if present, or the current cpus_allowed must
|
|
|
|
* not be empty.
|
|
|
|
*/
|
2022-09-02 04:57:41 +08:00
|
|
|
if (!is_partition_valid(parent)) {
|
|
|
|
return is_partition_invalid(parent)
|
|
|
|
? PERR_INVPARENT : PERR_NOTPART;
|
|
|
|
}
|
2023-06-27 22:35:01 +08:00
|
|
|
if (!newmask && cpumask_empty(cs->cpus_allowed))
|
2022-09-02 04:57:41 +08:00
|
|
|
return PERR_CPUSEMPTY;
|
2018-11-08 23:08:38 +08:00
|
|
|
|
|
|
|
/*
|
2022-09-02 04:57:43 +08:00
|
|
|
* new_prs will only be changed for the partcmd_update and
|
|
|
|
* partcmd_invalidate commands.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
|
|
|
adding = deleting = false;
|
2022-09-02 04:57:37 +08:00
|
|
|
old_prs = new_prs = cs->partition_root_state;
|
2018-11-08 23:08:38 +08:00
|
|
|
if (cmd == partcmd_enable) {
|
2022-09-02 04:57:38 +08:00
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* Enabling partition root is not allowed if cpus_allowed
|
|
|
|
* doesn't overlap parent's cpus_allowed.
|
2022-09-02 04:57:38 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
|
2022-09-02 04:57:41 +08:00
|
|
|
return PERR_INVCPUS;
|
2022-09-02 04:57:38 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A parent can be left with no CPU as long as there is no
|
2022-09-02 04:57:39 +08:00
|
|
|
* task directly associated with the parent partition.
|
2022-09-02 04:57:38 +08:00
|
|
|
*/
|
2023-01-31 23:48:03 +08:00
|
|
|
if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
|
2022-09-02 04:57:38 +08:00
|
|
|
partition_is_populated(parent, cs))
|
2022-09-02 04:57:41 +08:00
|
|
|
return PERR_NOCPUS;
|
2022-09-02 04:57:38 +08:00
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
cpumask_copy(tmp->addmask, cs->cpus_allowed);
|
2018-11-08 23:08:38 +08:00
|
|
|
adding = true;
|
|
|
|
} else if (cmd == partcmd_disable) {
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
|
|
|
* Need to remove cpus from parent's subparts_cpus for valid
|
|
|
|
* partition root.
|
|
|
|
*/
|
|
|
|
deleting = !is_prs_invalid(old_prs) &&
|
|
|
|
cpumask_and(tmp->delmask, cs->cpus_allowed,
|
2018-11-08 23:08:38 +08:00
|
|
|
parent->subparts_cpus);
|
2022-09-02 04:57:43 +08:00
|
|
|
} else if (cmd == partcmd_invalidate) {
|
|
|
|
if (is_prs_invalid(old_prs))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make the current partition invalid. It is assumed that
|
|
|
|
* invalidation is caused by violating cpu exclusivity rule.
|
|
|
|
*/
|
|
|
|
deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
|
|
|
|
parent->subparts_cpus);
|
|
|
|
if (old_prs > 0) {
|
|
|
|
new_prs = -old_prs;
|
|
|
|
part_error = PERR_NOTEXCL;
|
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
} else if (newmask) {
|
|
|
|
/*
|
|
|
|
* partcmd_update with newmask:
|
|
|
|
*
|
2022-09-02 04:57:39 +08:00
|
|
|
* Compute add/delete mask to/from subparts_cpus
|
|
|
|
*
|
2018-11-08 23:08:38 +08:00
|
|
|
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
|
2022-09-02 04:57:39 +08:00
|
|
|
* addmask = newmask & parent->cpus_allowed
|
2018-11-08 23:08:38 +08:00
|
|
|
* & ~parent->subparts_cpus
|
|
|
|
*/
|
2022-09-02 04:57:37 +08:00
|
|
|
cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
|
2018-11-08 23:08:38 +08:00
|
|
|
deleting = cpumask_and(tmp->delmask, tmp->delmask,
|
|
|
|
parent->subparts_cpus);
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
|
2018-11-08 23:08:38 +08:00
|
|
|
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
|
|
|
|
parent->subparts_cpus);
|
2023-06-27 22:35:01 +08:00
|
|
|
/*
|
2023-06-27 22:35:02 +08:00
|
|
|
* Empty cpumask is not allowed
|
2023-06-27 22:35:01 +08:00
|
|
|
*/
|
|
|
|
if (cpumask_empty(newmask)) {
|
|
|
|
part_error = PERR_CPUSEMPTY;
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* Make partition invalid if parent's effective_cpus could
|
|
|
|
* become empty and there are tasks in the parent.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2023-06-27 22:35:01 +08:00
|
|
|
} else if (adding &&
|
2022-09-02 04:57:39 +08:00
|
|
|
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
|
|
|
|
!cpumask_intersects(tmp->delmask, cpu_active_mask) &&
|
2022-09-02 04:57:38 +08:00
|
|
|
partition_is_populated(parent, cs)) {
|
2022-09-02 04:57:41 +08:00
|
|
|
part_error = PERR_NOCPUS;
|
2022-09-02 04:57:39 +08:00
|
|
|
adding = false;
|
|
|
|
deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
|
|
|
|
parent->subparts_cpus);
|
2018-11-08 23:08:41 +08:00
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* partcmd_update w/o newmask:
|
|
|
|
*
|
2022-09-02 04:57:39 +08:00
|
|
|
* delmask = cpus_allowed & parent->subparts_cpus
|
|
|
|
* addmask = cpus_allowed & parent->cpus_allowed
|
|
|
|
* & ~parent->subparts_cpus
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
2022-09-02 04:57:39 +08:00
|
|
|
* This gets invoked either due to a hotplug event or from
|
|
|
|
* update_cpumasks_hier(). This can cause the state of a
|
|
|
|
* partition root to transition from valid to invalid or vice
|
|
|
|
* versa. So we still need to compute the addmask and delmask.
|
|
|
|
|
|
|
|
* A partition error happens when:
|
|
|
|
* 1) Cpuset is valid partition, but parent does not distribute
|
|
|
|
* out any CPUs.
|
|
|
|
* 2) Parent has tasks and all its effective CPUs will have
|
|
|
|
* to be distributed out.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
cpumask_and(tmp->addmask, cs->cpus_allowed,
|
|
|
|
parent->cpus_allowed);
|
|
|
|
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
|
|
|
|
parent->subparts_cpus);
|
2022-09-02 04:57:43 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
|
|
|
|
(adding &&
|
|
|
|
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
|
|
|
|
partition_is_populated(parent, cs))) {
|
2022-09-02 04:57:41 +08:00
|
|
|
part_error = PERR_NOCPUS;
|
2022-09-02 04:57:39 +08:00
|
|
|
adding = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (part_error && is_partition_valid(cs) &&
|
|
|
|
parent->nr_subparts_cpus)
|
|
|
|
deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
|
|
|
|
parent->subparts_cpus);
|
2018-11-08 23:08:39 +08:00
|
|
|
}
|
2022-09-02 04:57:41 +08:00
|
|
|
if (part_error)
|
|
|
|
WRITE_ONCE(cs->prs_err, part_error);
|
2018-11-08 23:08:39 +08:00
|
|
|
|
|
|
|
if (cmd == partcmd_update) {
|
|
|
|
/*
|
2022-09-02 04:57:40 +08:00
|
|
|
* Check for possible transition between valid and invalid
|
|
|
|
* partition root.
|
2018-11-08 23:08:39 +08:00
|
|
|
*/
|
2022-09-02 04:57:37 +08:00
|
|
|
switch (cs->partition_root_state) {
|
|
|
|
case PRS_ROOT:
|
2022-09-02 04:57:40 +08:00
|
|
|
case PRS_ISOLATED:
|
2018-11-08 23:08:39 +08:00
|
|
|
if (part_error)
|
2022-09-02 04:57:40 +08:00
|
|
|
new_prs = -old_prs;
|
2018-11-08 23:08:39 +08:00
|
|
|
break;
|
2022-09-02 04:57:37 +08:00
|
|
|
case PRS_INVALID_ROOT:
|
2022-09-02 04:57:40 +08:00
|
|
|
case PRS_INVALID_ISOLATED:
|
2018-11-08 23:08:39 +08:00
|
|
|
if (!part_error)
|
2022-09-02 04:57:40 +08:00
|
|
|
new_prs = -old_prs;
|
2018-11-08 23:08:39 +08:00
|
|
|
break;
|
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
|
|
|
|
2021-08-11 11:06:02 +08:00
|
|
|
if (!adding && !deleting && (new_prs == old_prs))
|
2018-11-08 23:08:38 +08:00
|
|
|
return 0;
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
|
|
|
* Transitioning between invalid to valid or vice versa may require
|
2023-06-27 22:35:01 +08:00
|
|
|
* changing CS_CPU_EXCLUSIVE.
|
2022-09-02 04:57:39 +08:00
|
|
|
*/
|
|
|
|
if (old_prs != new_prs) {
|
2023-06-27 22:35:01 +08:00
|
|
|
int err = update_partition_exclusive(cs, new_prs);
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
return err;
|
2022-09-02 04:57:39 +08:00
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
|
|
|
* Change the parent's subparts_cpus.
|
|
|
|
* Newly added CPUs will be removed from effective_cpus and
|
|
|
|
* newly deleted ones will be added back to effective_cpus.
|
|
|
|
*/
|
|
|
|
spin_lock_irq(&callback_lock);
|
|
|
|
if (adding) {
|
|
|
|
cpumask_or(parent->subparts_cpus,
|
|
|
|
parent->subparts_cpus, tmp->addmask);
|
|
|
|
cpumask_andnot(parent->effective_cpus,
|
|
|
|
parent->effective_cpus, tmp->addmask);
|
|
|
|
}
|
|
|
|
if (deleting) {
|
|
|
|
cpumask_andnot(parent->subparts_cpus,
|
|
|
|
parent->subparts_cpus, tmp->delmask);
|
2018-11-08 23:08:41 +08:00
|
|
|
/*
|
|
|
|
* Some of the CPUs in subparts_cpus might have been offlined.
|
|
|
|
*/
|
|
|
|
cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
|
2018-11-08 23:08:38 +08:00
|
|
|
cpumask_or(parent->effective_cpus,
|
|
|
|
parent->effective_cpus, tmp->delmask);
|
|
|
|
}
|
|
|
|
|
|
|
|
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
|
2021-07-20 22:18:28 +08:00
|
|
|
|
2021-08-11 11:06:02 +08:00
|
|
|
if (old_prs != new_prs)
|
2022-09-02 04:57:37 +08:00
|
|
|
cs->partition_root_state = new_prs;
|
2021-08-11 11:06:02 +08:00
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2022-09-02 04:57:39 +08:00
|
|
|
|
2023-06-27 22:35:02 +08:00
|
|
|
if (adding || deleting) {
|
2023-03-17 23:15:05 +08:00
|
|
|
update_tasks_cpumask(parent, tmp->addmask);
|
2023-06-27 22:35:02 +08:00
|
|
|
if (parent->child_ecpus_count)
|
|
|
|
update_sibling_cpumasks(parent, cs, tmp);
|
|
|
|
}
|
2022-09-02 04:57:39 +08:00
|
|
|
|
2022-09-02 04:57:40 +08:00
|
|
|
/*
|
2023-06-27 22:35:01 +08:00
|
|
|
* For partcmd_update without newmask, it is being called from
|
|
|
|
* cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
|
|
|
|
* Update the load balance flag and scheduling domain if
|
|
|
|
* cpus_read_trylock() is successful.
|
2022-09-02 04:57:40 +08:00
|
|
|
*/
|
2023-06-27 22:35:01 +08:00
|
|
|
if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
|
|
|
|
update_partition_sd_lb(cs, old_prs);
|
|
|
|
cpus_read_unlock();
|
2022-09-02 04:57:40 +08:00
|
|
|
}
|
2023-06-27 22:35:01 +08:00
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
notify_partition_change(cs, old_prs);
|
2022-09-02 04:57:39 +08:00
|
|
|
return 0;
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
|
|
|
|
2023-06-27 22:35:03 +08:00
|
|
|
/*
|
|
|
|
* update_cpumasks_hier() flags
|
|
|
|
*/
|
|
|
|
/*
 * Check all cpusets with no skipping: when this bit is set in @flags,
 * update_cpumasks_hier() never takes its "skip the whole subtree" shortcut.
 */
#define HIER_CHECKALL 0x01
|
|
|
|
/*
 * Don't rebuild sched domains.
 * NOTE(review): presumably tested by update_cpumasks_hier() before it
 * requests a sched domain rebuild - confirm against the function body.
 */
#define HIER_NO_SD_REBUILD 0x02
|
|
|
|
|
2013-06-09 17:16:29 +08:00
|
|
|
/*
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
|
2018-11-08 23:08:38 +08:00
|
|
|
* @cs: the cpuset to consider
|
|
|
|
* @tmp: temp variables for calculating effective_cpus & partition setup
|
2022-09-02 04:57:39 +08:00
|
|
|
* @flags: HIER_* flags; with HIER_CHECKALL set, no descendant cpuset is skipped
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
*
|
2021-01-13 12:37:41 +08:00
|
|
|
* When configured cpumask is changed, the effective cpumasks of this cpuset
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
* and all its descendants need to be updated.
|
2013-06-09 17:16:29 +08:00
|
|
|
*
|
2021-01-13 12:37:41 +08:00
|
|
|
* On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
|
2013-06-09 17:16:29 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Called with cpuset_mutex held
|
2013-06-09 17:16:29 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
|
2023-06-27 22:35:03 +08:00
|
|
|
int flags)
|
2013-06-09 17:16:29 +08:00
|
|
|
{
|
|
|
|
struct cpuset *cp;
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2014-07-09 16:47:50 +08:00
|
|
|
bool need_rebuild_sched_domains = false;
|
2021-08-11 11:06:02 +08:00
|
|
|
int old_prs, new_prs;
|
2013-06-09 17:16:29 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
|
|
|
|
struct cpuset *parent = parent_cs(cp);
|
2022-09-02 04:57:39 +08:00
|
|
|
bool update_parent = false;
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
compute_effective_cpumask(tmp->new_cpus, cp, parent);
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The last item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:41 +08:00
|
|
|
/*
|
|
|
|
* If it becomes empty, inherit the effective mask of the
|
2022-09-02 04:57:38 +08:00
|
|
|
* parent, which is guaranteed to have some CPUs unless
|
|
|
|
* it is a partition root that has explicitly distributed
|
|
|
|
* out all its CPUs.
|
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The last item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:41 +08:00
|
|
|
*/
|
2018-11-08 23:08:40 +08:00
|
|
|
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
|
2022-09-02 04:57:38 +08:00
|
|
|
if (is_partition_valid(cp) &&
|
|
|
|
cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
|
|
|
|
goto update_parent_subparts;
|
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
|
2018-11-08 23:08:40 +08:00
|
|
|
if (!cp->use_parent_ecpus) {
|
|
|
|
cp->use_parent_ecpus = true;
|
|
|
|
parent->child_ecpus_count++;
|
|
|
|
}
|
|
|
|
} else if (cp->use_parent_ecpus) {
|
|
|
|
cp->use_parent_ecpus = false;
|
|
|
|
WARN_ON_ONCE(!parent->child_ecpus_count);
|
|
|
|
parent->child_ecpus_count--;
|
|
|
|
}
|
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The last item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:41 +08:00
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
2023-06-27 22:35:00 +08:00
|
|
|
* Skip the whole subtree if
|
|
|
|
* 1) the cpumask remains the same,
|
|
|
|
* 2) has no partition root state,
|
2023-06-27 22:35:03 +08:00
|
|
|
* 3) HIER_CHECKALL flag not set, and
|
2023-06-27 22:35:00 +08:00
|
|
|
* 4) for v2 load balance state same as its parent.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2023-06-27 22:35:03 +08:00
|
|
|
if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
|
2023-06-27 22:35:00 +08:00
|
|
|
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
|
|
|
|
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
|
|
|
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
|
|
|
continue;
|
2013-06-09 17:16:29 +08:00
|
|
|
}
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2022-09-02 04:57:38 +08:00
|
|
|
update_parent_subparts:
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
|
|
|
* update_parent_subparts_cpumask() should have been called
|
|
|
|
* for cs already in update_cpumask(). We should also call
|
|
|
|
* update_tasks_cpumask() again for tasks in the parent
|
|
|
|
* cpuset if the parent's subparts_cpus changes.
|
|
|
|
*/
|
2021-08-11 11:06:02 +08:00
|
|
|
old_prs = new_prs = cp->partition_root_state;
|
|
|
|
if ((cp != cs) && old_prs) {
|
2018-11-08 23:08:39 +08:00
|
|
|
switch (parent->partition_root_state) {
|
2022-09-02 04:57:37 +08:00
|
|
|
case PRS_ROOT:
|
2022-09-02 04:57:40 +08:00
|
|
|
case PRS_ISOLATED:
|
2022-09-02 04:57:39 +08:00
|
|
|
update_parent = true;
|
2018-11-08 23:08:39 +08:00
|
|
|
break;
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
default:
|
2018-11-08 23:08:39 +08:00
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* When parent is not a partition root or is
|
|
|
|
* invalid, child partition roots become
|
|
|
|
* invalid too.
|
2018-11-08 23:08:39 +08:00
|
|
|
*/
|
2022-09-02 04:57:40 +08:00
|
|
|
if (is_partition_valid(cp))
|
|
|
|
new_prs = -cp->partition_root_state;
|
2022-09-02 04:57:41 +08:00
|
|
|
WRITE_ONCE(cp->prs_err,
|
|
|
|
is_partition_invalid(parent)
|
|
|
|
? PERR_INVPARENT : PERR_NOTPART);
|
2018-11-08 23:08:39 +08:00
|
|
|
break;
|
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
|
|
|
|
2014-05-14 00:11:01 +08:00
|
|
|
if (!css_tryget_online(&cp->css))
|
2013-06-09 17:16:29 +08:00
|
|
|
continue;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
if (update_parent) {
|
|
|
|
update_parent_subparts_cpumask(cp, partcmd_update, NULL,
|
|
|
|
tmp);
|
|
|
|
/*
|
|
|
|
* The cpuset partition_root_state may become
|
|
|
|
* invalid. Capture it.
|
|
|
|
*/
|
|
|
|
new_prs = cp->partition_root_state;
|
|
|
|
}
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2018-11-08 23:08:38 +08:00
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
|
|
|
* Put all active subparts_cpus back to effective_cpus.
|
|
|
|
*/
|
|
|
|
cpumask_or(tmp->new_cpus, tmp->new_cpus,
|
|
|
|
cp->subparts_cpus);
|
|
|
|
cpumask_and(tmp->new_cpus, tmp->new_cpus,
|
|
|
|
cpu_active_mask);
|
2018-11-08 23:08:39 +08:00
|
|
|
cp->nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(cp->subparts_cpus);
|
2022-09-02 04:57:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
|
|
|
|
if (cp->nr_subparts_cpus) {
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
|
|
|
* Make sure that effective_cpus & subparts_cpus
|
|
|
|
* are mutually exclusive.
|
|
|
|
*/
|
|
|
|
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
|
|
|
|
cp->subparts_cpus);
|
|
|
|
}
|
2021-07-20 22:18:28 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
cp->partition_root_state = new_prs;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2022-09-02 04:57:39 +08:00
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
notify_partition_change(cp, old_prs);
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2017-08-18 03:33:10 +08:00
|
|
|
WARN_ON(!is_in_v2_mode() &&
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
|
|
|
|
|
2023-02-01 06:17:19 +08:00
|
|
|
update_tasks_cpumask(cp, tmp->new_cpus);
|
2013-06-09 17:16:29 +08:00
|
|
|
|
2023-06-27 22:35:00 +08:00
|
|
|
/*
|
|
|
|
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
|
|
|
|
* from parent if current cpuset isn't a valid partition root
|
|
|
|
* and their load balance states differ.
|
|
|
|
*/
|
|
|
|
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
|
|
|
!is_partition_valid(cp) &&
|
|
|
|
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
|
|
|
|
if (is_sched_load_balance(parent))
|
|
|
|
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
|
|
|
|
else
|
|
|
|
clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
|
|
|
|
}
|
|
|
|
|
2014-07-09 16:47:50 +08:00
|
|
|
/*
|
2018-11-08 23:08:42 +08:00
|
|
|
* On legacy hierarchy, if the effective cpumask of any non-
|
|
|
|
* empty cpuset is changed, we need to rebuild sched domains.
|
|
|
|
* On default hierarchy, the cpuset needs to be a partition
|
|
|
|
* root as well.
|
2014-07-09 16:47:50 +08:00
|
|
|
*/
|
|
|
|
if (!cpumask_empty(cp->cpus_allowed) &&
|
2018-11-08 23:08:42 +08:00
|
|
|
is_sched_load_balance(cp) &&
|
|
|
|
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
2022-09-02 04:57:37 +08:00
|
|
|
is_partition_valid(cp)))
|
2014-07-09 16:47:50 +08:00
|
|
|
need_rebuild_sched_domains = true;
|
|
|
|
|
2013-06-09 17:16:29 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
css_put(&cp->css);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
2014-07-09 16:47:50 +08:00
|
|
|
|
2023-06-27 22:35:03 +08:00
|
|
|
if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
|
2014-07-09 16:47:50 +08:00
|
|
|
rebuild_sched_domains_locked();
|
2013-06-09 17:16:29 +08:00
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:40 +08:00
|
|
|
/**
|
|
|
|
* update_sibling_cpumasks - Update siblings cpumasks
|
|
|
|
* @parent: Parent cpuset
|
|
|
|
* @cs: Current cpuset
|
|
|
|
* @tmp: Temp variables
|
|
|
|
*/
|
|
|
|
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
|
|
|
struct tmpmasks *tmp)
|
|
|
|
{
|
|
|
|
struct cpuset *sibling;
|
|
|
|
struct cgroup_subsys_state *pos_css;
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
lockdep_assert_held(&cpuset_mutex);
|
2022-02-03 11:31:03 +08:00
|
|
|
|
2018-11-08 23:08:40 +08:00
|
|
|
/*
|
|
|
|
* Check all its siblings and call update_cpumasks_hier()
|
|
|
|
* if their use_parent_ecpus flag is set in order for them
|
|
|
|
* to use the right effective_cpus value.
|
2022-02-03 11:31:03 +08:00
|
|
|
*
|
|
|
|
* The update_cpumasks_hier() function may sleep. So we have to
|
2023-06-27 22:35:03 +08:00
|
|
|
* release the RCU read lock before calling it. HIER_NO_SD_REBUILD
|
|
|
|
* flag is used to suppress rebuild of sched domains as the callers
|
|
|
|
* will take care of that.
|
2018-11-08 23:08:40 +08:00
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
|
|
|
cpuset_for_each_child(sibling, pos_css, parent) {
|
|
|
|
if (sibling == cs)
|
|
|
|
continue;
|
|
|
|
if (!sibling->use_parent_ecpus)
|
|
|
|
continue;
|
2022-02-03 11:31:03 +08:00
|
|
|
if (!css_tryget_online(&sibling->css))
|
|
|
|
continue;
|
2018-11-08 23:08:40 +08:00
|
|
|
|
2022-02-03 11:31:03 +08:00
|
|
|
rcu_read_unlock();
|
2023-06-27 22:35:03 +08:00
|
|
|
update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
|
2022-02-03 11:31:03 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
css_put(&sibling->css);
|
2018-11-08 23:08:40 +08:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2008-02-07 16:14:44 +08:00
|
|
|
/**
|
|
|
|
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
|
|
|
|
* @cs: the cpuset to consider
|
2014-05-06 01:46:55 +08:00
|
|
|
* @trialcs: trial cpuset
|
2008-02-07 16:14:44 +08:00
|
|
|
* @buf: buffer of cpu numbers written to this cpuset
|
|
|
|
*/
|
2009-01-08 10:08:43 +08:00
|
|
|
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
|
|
|
const char *buf)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-02-07 16:14:44 +08:00
|
|
|
int retval;
|
2018-11-08 23:08:38 +08:00
|
|
|
struct tmpmasks tmp;
|
2022-09-02 04:57:43 +08:00
|
|
|
bool invalidate = false;
|
2023-06-27 22:35:01 +08:00
|
|
|
int old_prs = cs->partition_root_state;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-03-29 13:08:31 +08:00
|
|
|
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to cpu_online_map
Change the list of cpus allowed to tasks in the top (root) cpuset to
dynamically track what cpus are online, using a CPU hotplug notifier. Make
this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support CPU hotplug, then these tasks cannot make use
of CPUs that are added after system boot, because the CPUs are not allowed
in the top cpuset. This is a surprising regression over earlier kernels
that didn't have cpusets enabled.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'cpus' file in the top (root) cpuset, making it read
only, and making it automatically track the value of cpu_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged CPUs allowed
by their cpuset.
Thanks to Anton Blanchard and Nathan Lynch for reporting this problem,
driving the fix, and earlier versions of this patch.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-08-27 16:23:51 +08:00
|
|
|
if (cs == &top_cpuset)
|
|
|
|
return -EACCES;
|
|
|
|
|
2007-05-08 15:31:43 +08:00
|
|
|
/*
|
hotplug cpu: move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's
[PATCH_3_of_3]_cpusets__update_cpumask_revision.patch
I had had "iff", meaning "if and only if" in a comment. However, except for
ancient mathematicians, the abbreviation "iff" was a tad too cryptic. Cliff
changed it to "if", presumably figuring that the "iff" was a typo. However,
it was the "only if" half of the conjunction that was most interesting.
Reword to emphasis the "only if" aspect.
The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said
callback_mutex had to be held on entry. The opposite is true.
Several mentions of attach_task() in comments needed to be
changed to cgroup_attach_task().
A comment about notify_on_release was no longer relevant,
as the line of code it had commented, namely:
set_bit(CS_RELEASED_RESOURCE, &parent->flags);
is no longer present in that place in the cpuset.c code.
Similarly a comment about notify_on_release before the
scan_for_empty_cpusets() routine was no longer relevant.
Removed extra parentheses and unnecessary return statement.
Renamed attach_task() to cpuset_attach() in various comments.
Removed comment about not needing memory migration, as it seems the migration
is done anyway, via the cpuset_attach() callback from cgroup_attach_task().
Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Cliff Wickman <cpw@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-07 16:14:46 +08:00
|
|
|
* An empty cpus_allowed is ok only if the cpuset has no tasks.
|
2007-10-19 14:40:21 +08:00
|
|
|
* Since cpulist_parse() fails on an empty mask, we special case
|
|
|
|
* that parsing. The validate_change() call ensures that cpusets
|
|
|
|
* with tasks have cpus.
|
2007-05-08 15:31:43 +08:00
|
|
|
*/
|
2007-10-19 14:40:21 +08:00
|
|
|
if (!*buf) {
|
2009-01-08 10:08:44 +08:00
|
|
|
cpumask_clear(trialcs->cpus_allowed);
|
2007-05-08 15:31:43 +08:00
|
|
|
} else {
|
2009-01-08 10:08:44 +08:00
|
|
|
retval = cpulist_parse(buf, trialcs->cpus_allowed);
|
2007-05-08 15:31:43 +08:00
|
|
|
if (retval < 0)
|
|
|
|
return retval;
|
2008-06-06 13:46:32 +08:00
|
|
|
|
2014-07-09 16:49:12 +08:00
|
|
|
if (!cpumask_subset(trialcs->cpus_allowed,
|
|
|
|
top_cpuset.cpus_allowed))
|
2008-06-06 13:46:32 +08:00
|
|
|
return -EINVAL;
|
2007-05-08 15:31:43 +08:00
|
|
|
}
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2007-10-19 14:40:22 +08:00
|
|
|
/* Nothing to do if the cpus didn't change */
|
2009-01-08 10:08:44 +08:00
|
|
|
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
|
2007-10-19 14:40:22 +08:00
|
|
|
return 0;
|
2008-02-07 16:14:44 +08:00
|
|
|
|
2023-06-27 22:35:02 +08:00
|
|
|
if (alloc_cpumasks(NULL, &tmp))
|
|
|
|
return -ENOMEM;
|
2018-11-08 23:08:38 +08:00
|
|
|
|
2022-09-02 04:57:43 +08:00
|
|
|
retval = validate_change(cs, trialcs);
|
|
|
|
|
|
|
|
if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
|
|
|
|
struct cpuset *cp, *parent;
|
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The -EINVAL error code indicates that partition sibling
|
|
|
|
* CPU exclusivity rule has been violated. We still allow
|
|
|
|
* the cpumask change to proceed while invalidating the
|
|
|
|
* partition. However, any conflicting sibling partitions
|
|
|
|
* have to be marked as invalid too.
|
|
|
|
*/
|
|
|
|
invalidate = true;
|
|
|
|
rcu_read_lock();
|
|
|
|
parent = parent_cs(cs);
|
|
|
|
cpuset_for_each_child(cp, css, parent)
|
|
|
|
if (is_partition_valid(cp) &&
|
|
|
|
cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
|
|
|
|
rcu_read_lock();
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
retval = 0;
|
|
|
|
}
|
|
|
|
if (retval < 0)
|
2023-06-27 22:35:02 +08:00
|
|
|
goto out_free;
|
2022-09-02 04:57:43 +08:00
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
if (cs->partition_root_state) {
|
2022-09-02 04:57:43 +08:00
|
|
|
if (invalidate)
|
|
|
|
update_parent_subparts_cpumask(cs, partcmd_invalidate,
|
|
|
|
NULL, &tmp);
|
|
|
|
else
|
|
|
|
update_parent_subparts_cpumask(cs, partcmd_update,
|
|
|
|
trialcs->cpus_allowed, &tmp);
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
compute_effective_cpumask(trialcs->effective_cpus, trialcs,
|
|
|
|
parent_cs(cs));
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2009-01-08 10:08:44 +08:00
|
|
|
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
|
2018-11-08 23:08:38 +08:00
|
|
|
|
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* Make sure that subparts_cpus, if not empty, is a subset of
|
|
|
|
* cpus_allowed. Clear subparts_cpus if partition not valid or
|
|
|
|
* empty effective cpus with tasks.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
|
|
|
if (cs->nr_subparts_cpus) {
|
2022-09-02 04:57:39 +08:00
|
|
|
if (!is_partition_valid(cs) ||
|
|
|
|
(cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
|
|
|
|
partition_is_populated(cs, NULL))) {
|
|
|
|
cs->nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(cs->subparts_cpus);
|
|
|
|
} else {
|
|
|
|
cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
|
|
|
|
cs->cpus_allowed);
|
|
|
|
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
|
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
/* effective_cpus will be updated here */
|
2023-06-27 22:35:03 +08:00
|
|
|
update_cpumasks_hier(cs, &tmp, 0);
|
2018-11-08 23:08:40 +08:00
|
|
|
|
|
|
|
if (cs->partition_root_state) {
|
|
|
|
struct cpuset *parent = parent_cs(cs);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For partition root, update the cpumasks of sibling
|
|
|
|
* cpusets if they use parent's effective_cpus.
|
|
|
|
*/
|
|
|
|
if (parent->child_ecpus_count)
|
|
|
|
update_sibling_cpumasks(parent, cs, &tmp);
|
2023-06-27 22:35:01 +08:00
|
|
|
|
|
|
|
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
|
|
|
|
update_partition_sd_lb(cs, old_prs);
|
2018-11-08 23:08:40 +08:00
|
|
|
}
|
2023-06-27 22:35:02 +08:00
|
|
|
out_free:
|
|
|
|
free_cpumasks(NULL, &tmp);
|
2005-06-26 05:57:34 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-03-31 18:30:52 +08:00
|
|
|
/*
|
2016-01-20 01:18:41 +08:00
|
|
|
* Migrate memory region from one set of nodes to another. This is
|
|
|
|
* performed asynchronously as it can be called from process migration path
|
|
|
|
* holding locks involved in process management. All mm migrations are
|
|
|
|
* performed in the queued order and can be waited for by flushing
|
|
|
|
* cpuset_migrate_mm_wq.
|
2006-03-31 18:30:52 +08:00
|
|
|
*/
|
|
|
|
|
2016-01-20 01:18:41 +08:00
|
|
|
/*
 * One queued mm migration request. Allocated and filled in by
 * cpuset_migrate_mm(), consumed and freed by cpuset_migrate_mm_workfn().
 */
struct cpuset_migrate_mm_work {
|
|
|
|
struct work_struct work; /* item queued on cpuset_migrate_mm_wq */
|
|
|
|
struct mm_struct *mm; /* mm to migrate; mmput() by the work function */
|
|
|
|
nodemask_t from; /* copy of the source nodemask passed to do_migrate_pages() */
|
|
|
|
nodemask_t to; /* copy of the destination nodemask passed to do_migrate_pages() */
|
|
|
|
};
|
|
|
|
|
|
|
|
static void cpuset_migrate_mm_workfn(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct cpuset_migrate_mm_work *mwork =
|
|
|
|
container_of(work, struct cpuset_migrate_mm_work, work);
|
|
|
|
|
|
|
|
/* on a wq worker, no need to worry about %current's mems_allowed */
|
|
|
|
do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
|
|
|
|
mmput(mwork->mm);
|
|
|
|
kfree(mwork);
|
|
|
|
}
|
|
|
|
|
2006-03-31 18:30:52 +08:00
|
|
|
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
|
|
|
|
const nodemask_t *to)
|
|
|
|
{
|
2016-01-20 01:18:41 +08:00
|
|
|
struct cpuset_migrate_mm_work *mwork;
|
2006-03-31 18:30:52 +08:00
|
|
|
|
2021-08-25 18:54:15 +08:00
|
|
|
if (nodes_equal(*from, *to)) {
|
|
|
|
mmput(mm);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-01-20 01:18:41 +08:00
|
|
|
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
|
|
|
|
if (mwork) {
|
|
|
|
mwork->mm = mm;
|
|
|
|
mwork->from = *from;
|
|
|
|
mwork->to = *to;
|
|
|
|
INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
|
|
|
|
queue_work(cpuset_migrate_mm_wq, &mwork->work);
|
|
|
|
} else {
|
|
|
|
mmput(mm);
|
|
|
|
}
|
|
|
|
}
|
2006-03-31 18:30:52 +08:00
|
|
|
|
2016-04-22 07:06:48 +08:00
|
|
|
/*
 * Flush cpuset_migrate_mm_wq, i.e. wait for the mm migrations queued by
 * cpuset_migrate_mm() to complete before returning.
 */
static void cpuset_post_attach(void)
|
2016-01-20 01:18:41 +08:00
|
|
|
{
|
|
|
|
flush_workqueue(cpuset_migrate_mm_wq);
|
2006-03-31 18:30:52 +08:00
|
|
|
}
|
|
|
|
|
2009-04-03 07:57:51 +08:00
|
|
|
/*
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
|
|
|
|
* @tsk: the task to change
|
|
|
|
* @newmems: new nodes that the task will be set
|
|
|
|
*
|
2017-07-07 06:40:09 +08:00
|
|
|
* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
|
|
|
|
* and rebind an eventual tasks' mempolicy. If the task is allocating in
|
|
|
|
* parallel, it might temporarily see an empty intersection, which results in
|
|
|
|
* a seqlock check and retry before OOM or allocation failure.
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
*/
|
|
|
|
static void cpuset_change_task_nodemask(struct task_struct *tsk,
|
|
|
|
nodemask_t *newmems)
|
|
|
|
{
|
2010-05-25 05:32:08 +08:00
|
|
|
task_lock(tsk);
|
|
|
|
|
2017-07-07 06:40:09 +08:00
|
|
|
local_irq_disable();
|
|
|
|
write_seqcount_begin(&tsk->mems_allowed_seq);
|
2010-05-25 05:32:08 +08:00
|
|
|
|
cpuset: mm: reduce large amounts of memory barrier related damage v3
Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") wins a super prize for the largest number of
memory barriers entered into fast paths for one commit.
[get|put]_mems_allowed is incredibly heavy with pairs of full memory
barriers inserted into a number of hot paths. This was detected while
investigating at large page allocator slowdown introduced some time
after 2.6.32. The largest portion of this overhead was shown by
oprofile to be at an mfence introduced by this commit into the page
allocator hot path.
For extra style points, the commit introduced the use of yield() in an
implementation of what looks like a spinning mutex.
This patch replaces the full memory barriers on both read and write
sides with a sequence counter with just read barriers on the fast path
side. This is much cheaper on some architectures, including x86. The
main bulk of the patch is the retry logic if the nodemask changes in a
manner that can cause a false failure.
While updating the nodemask, a check is made to see if a false failure
is a risk. If it is, the sequence number gets bumped and parallel
allocators will briefly stall while the nodemask update takes place.
In a page fault test microbenchmark, oprofile samples from
__alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The
actual results were
3.3.0-rc3 3.3.0-rc3
rc3-vanilla nobarrier-v2r1
Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%)
Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%)
Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%)
Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%)
Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%)
Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%)
Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%)
Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%)
Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%)
Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%)
Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%)
Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%)
Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%)
Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%)
Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%)
MMTests Statistics: duration
Sys Time Running Test (seconds) 135.68 132.17
User+Sys Time Running Test (seconds) 164.2 160.13
Total Elapsed Time (seconds) 123.46 120.87
The overall improvement is small but the System CPU time is much
improved and roughly in correlation to what oprofile reported (these
performance figures are without profiling so skew is expected). The
actual number of page faults is noticeably improved.
For benchmarks like kernel builds, the overall benefit is marginal but
the system CPU time is slightly reduced.
To test the actual bug the commit fixed I opened two terminals. The
first ran within a cpuset and continually ran a small program that
faulted 100M of anonymous data. In a second window, the nodemask of the
cpuset was continually randomised in a loop.
Without the commit, the program would fail every so often (usually
within 10 seconds) and obviously with the commit everything worked fine.
With this patch applied, it also worked fine so the fix should be
functionally equivalent.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:11 +08:00
|
|
|
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
|
2017-07-07 06:40:06 +08:00
|
|
|
mpol_rebind_task(tsk, newmems);
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
tsk->mems_allowed = *newmems;
|
cpuset: mm: reduce large amounts of memory barrier related damage v3
Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") wins a super prize for the largest number of
memory barriers entered into fast paths for one commit.
[get|put]_mems_allowed is incredibly heavy with pairs of full memory
barriers inserted into a number of hot paths. This was detected while
investigating at large page allocator slowdown introduced some time
after 2.6.32. The largest portion of this overhead was shown by
oprofile to be at an mfence introduced by this commit into the page
allocator hot path.
For extra style points, the commit introduced the use of yield() in an
implementation of what looks like a spinning mutex.
This patch replaces the full memory barriers on both read and write
sides with a sequence counter with just read barriers on the fast path
side. This is much cheaper on some architectures, including x86. The
main bulk of the patch is the retry logic if the nodemask changes in a
manner that can cause a false failure.
While updating the nodemask, a check is made to see if a false failure
is a risk. If it is, the sequence number gets bumped and parallel
allocators will briefly stall while the nodemask update takes place.
In a page fault test microbenchmark, oprofile samples from
__alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The
actual results were
3.3.0-rc3 3.3.0-rc3
rc3-vanilla nobarrier-v2r1
Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%)
Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%)
Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%)
Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%)
Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%)
Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%)
Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%)
Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%)
Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%)
Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%)
Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%)
Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%)
Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%)
Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%)
Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%)
MMTests Statistics: duration
Sys Time Running Test (seconds) 135.68 132.17
User+Sys Time Running Test (seconds) 164.2 160.13
Total Elapsed Time (seconds) 123.46 120.87
The overall improvement is small but the System CPU time is much
improved and roughly in correlation to what oprofile reported (these
performance figures are without profiling so skew is expected). The
actual number of page faults is noticeably improved.
For benchmarks like kernel builds, the overall benefit is marginal but
the system CPU time is slightly reduced.
To test the actual bug the commit fixed I opened two terminals. The
first ran within a cpuset and continually ran a small program that
faulted 100M of anonymous data. In a second window, the nodemask of the
cpuset was continually randomised in a loop.
Without the commit, the program would fail every so often (usually
within 10 seconds) and obviously with the commit everything worked fine.
With this patch applied, it also worked fine so the fix should be
functionally equivalent.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:11 +08:00
|
|
|
|
2017-07-07 06:40:09 +08:00
|
|
|
write_seqcount_end(&tsk->mems_allowed_seq);
|
|
|
|
local_irq_enable();
|
cpuset: mm: reduce large amounts of memory barrier related damage v3
Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") wins a super prize for the largest number of
memory barriers entered into fast paths for one commit.
[get|put]_mems_allowed is incredibly heavy with pairs of full memory
barriers inserted into a number of hot paths. This was detected while
investigating at large page allocator slowdown introduced some time
after 2.6.32. The largest portion of this overhead was shown by
oprofile to be at an mfence introduced by this commit into the page
allocator hot path.
For extra style points, the commit introduced the use of yield() in an
implementation of what looks like a spinning mutex.
This patch replaces the full memory barriers on both read and write
sides with a sequence counter with just read barriers on the fast path
side. This is much cheaper on some architectures, including x86. The
main bulk of the patch is the retry logic if the nodemask changes in a
manner that can cause a false failure.
While updating the nodemask, a check is made to see if a false failure
is a risk. If it is, the sequence number gets bumped and parallel
allocators will briefly stall while the nodemask update takes place.
In a page fault test microbenchmark, oprofile samples from
__alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The
actual results were
3.3.0-rc3 3.3.0-rc3
rc3-vanilla nobarrier-v2r1
Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%)
Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%)
Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%)
Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%)
Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%)
Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%)
Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%)
Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%)
Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%)
Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%)
Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%)
Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%)
Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%)
Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%)
Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%)
MMTests Statistics: duration
Sys Time Running Test (seconds) 135.68 132.17
User+Sys Time Running Test (seconds) 164.2 160.13
Total Elapsed Time (seconds) 123.46 120.87
The overall improvement is small but the System CPU time is much
improved and roughly in correlation to what oprofile reported (these
performance figures are without profiling so skew is expected). The
actual number of page faults is noticeably improved.
For benchmarks like kernel builds, the overall benefit is marginal but
the system CPU time is slightly reduced.
To test the actual bug the commit fixed I opened two terminals. The
first ran within a cpuset and continually ran a small program that
faulted 100M of anonymous data. In a second window, the nodemask of the
cpuset was continually randomised in a loop.
Without the commit, the program would fail every so often (usually
within 10 seconds) and obviously with the commit everything worked fine.
With this patch applied, it also worked fine so the fix should be
functionally equivalent.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:11 +08:00
|
|
|
|
2010-05-25 05:32:08 +08:00
|
|
|
task_unlock(tsk);
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
}
|
|
|
|
|
2007-10-19 14:39:39 +08:00
|
|
|
/*
 * Cpuset whose tasks are currently having their nodemasks updated.
 * update_tasks_nodemask() points this at the cpuset it is processing so
 * that the mpol_dup() check can catch tasks that fork while the update is
 * in progress.
 * NOTE(review): writers appear to be serialized by cpuset_mutex, per the
 * comment inside update_tasks_nodemask() -- confirm for any new user.
 */
static void *cpuset_being_rebound;
|
|
|
|
|
2008-07-25 16:47:21 +08:00
|
|
|
/**
|
|
|
|
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
|
|
|
|
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
|
|
|
|
*
|
2014-02-13 19:58:40 +08:00
|
|
|
* Iterate through each task of @cs updating its mems_allowed to the
|
2023-05-08 15:58:50 +08:00
|
|
|
* effective cpuset's. As this function is called with cpuset_mutex held,
|
2014-02-13 19:58:40 +08:00
|
|
|
* cpuset membership stays stable.
|
2008-07-25 16:47:21 +08:00
|
|
|
*/
|
2014-02-13 19:58:40 +08:00
|
|
|
static void update_tasks_nodemask(struct cpuset *cs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2023-05-08 15:58:50 +08:00
|
|
|
static nodemask_t newmems; /* protected by cpuset_mutex */
|
2014-02-13 19:58:40 +08:00
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
2006-01-08 17:01:52 +08:00
|
|
|
|
2008-04-28 17:13:09 +08:00
|
|
|
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
|
2014-07-09 16:48:32 +08:00
|
|
|
guarantee_online_mems(cs, &newmems);
|
2013-06-09 17:15:08 +08:00
|
|
|
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
/*
|
2020-06-09 12:33:54 +08:00
|
|
|
* The mpol_rebind_mm() call takes mmap_lock, which we couldn't
|
2009-04-03 07:57:51 +08:00
|
|
|
* take while holding tasklist_lock. Forks can happen - the
|
|
|
|
* mpol_dup() cpuset_being_rebound check will catch such forks,
|
|
|
|
* and rebind their vma mempolicies too. Because we still hold
|
2023-05-08 15:58:50 +08:00
|
|
|
* the global cpuset_mutex, we know that no other rebind effort
|
2009-04-03 07:57:51 +08:00
|
|
|
* will be contending for the global variable cpuset_being_rebound.
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
|
2006-01-08 17:02:00 +08:00
|
|
|
* is idempotent. Also migrate pages in each mm to new nodes.
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
*/
|
2017-05-15 21:34:01 +08:00
|
|
|
css_task_iter_start(&cs->css, 0, &it);
|
2014-02-13 19:58:40 +08:00
|
|
|
while ((task = css_task_iter_next(&it))) {
|
|
|
|
struct mm_struct *mm;
|
|
|
|
bool migrate;
|
|
|
|
|
|
|
|
cpuset_change_task_nodemask(task, &newmems);
|
|
|
|
|
|
|
|
mm = get_task_mm(task);
|
|
|
|
if (!mm)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
migrate = is_memory_migrate(cs);
|
|
|
|
|
|
|
|
mpol_rebind_mm(mm, &cs->mems_allowed);
|
|
|
|
if (migrate)
|
|
|
|
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
|
2016-01-20 01:18:41 +08:00
|
|
|
else
|
|
|
|
mmput(mm);
|
2014-02-13 19:58:40 +08:00
|
|
|
}
|
|
|
|
css_task_iter_end(&it);
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
|
2013-06-09 17:15:08 +08:00
|
|
|
/*
|
|
|
|
* All the tasks' nodemasks have been updated, update
|
|
|
|
* cs->old_mems_allowed.
|
|
|
|
*/
|
|
|
|
cs->old_mems_allowed = newmems;
|
|
|
|
|
2008-02-07 16:14:45 +08:00
|
|
|
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
|
2007-10-19 14:39:39 +08:00
|
|
|
cpuset_being_rebound = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-06-09 17:16:29 +08:00
|
|
|
/*
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
|
|
|
|
* @cs: the cpuset to consider
|
|
|
|
* @new_mems: a temp variable for calculating new effective_mems
|
2013-06-09 17:16:29 +08:00
|
|
|
*
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
* When configured nodemask is changed, the effective nodemasks of this cpuset
|
|
|
|
* and all its descendants need to be updated.
|
2013-06-09 17:16:29 +08:00
|
|
|
*
|
2021-04-08 16:03:46 +08:00
|
|
|
* On legacy hierarchy, effective_mems will be the same as mems_allowed.
|
2013-06-09 17:16:29 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Called with cpuset_mutex held
|
2013-06-09 17:16:29 +08:00
|
|
|
*/
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
|
2013-06-09 17:16:29 +08:00
|
|
|
{
|
|
|
|
struct cpuset *cp;
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2013-06-09 17:16:29 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
|
|
|
|
struct cpuset *parent = parent_cs(cp);
|
|
|
|
|
|
|
|
nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
|
|
|
|
|
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The last item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:41 +08:00
|
|
|
/*
|
|
|
|
* If it becomes empty, inherit the effective mask of the
|
|
|
|
* parent, which is guaranteed to have some MEMs.
|
|
|
|
*/
|
2017-08-18 03:33:10 +08:00
|
|
|
if (is_in_v2_mode() && nodes_empty(*new_mems))
|
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The last item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:41 +08:00
|
|
|
*new_mems = parent->effective_mems;
|
|
|
|
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
/* Skip the whole subtree if the nodemask remains the same. */
|
|
|
|
if (nodes_equal(*new_mems, cp->effective_mems)) {
|
|
|
|
pos_css = css_rightmost_descendant(pos_css);
|
|
|
|
continue;
|
2013-06-09 17:16:29 +08:00
|
|
|
}
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2014-05-14 00:11:01 +08:00
|
|
|
if (!css_tryget_online(&cp->css))
|
2013-06-09 17:16:29 +08:00
|
|
|
continue;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
cp->effective_mems = *new_mems;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2017-08-18 03:33:10 +08:00
|
|
|
WARN_ON(!is_in_v2_mode() &&
|
2014-07-30 15:07:13 +08:00
|
|
|
!nodes_equal(cp->mems_allowed, cp->effective_mems));
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
|
2014-02-13 19:58:40 +08:00
|
|
|
update_tasks_nodemask(cp);
|
2013-06-09 17:16:29 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
css_put(&cp->css);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2008-07-25 16:47:21 +08:00
|
|
|
/*
|
|
|
|
* Handle user request to change the 'mems' memory placement
|
|
|
|
* of a cpuset. Needs to validate the request, update the
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates another's mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock, the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mempolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES' flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
* cpusets mems_allowed, and for each task in the cpuset,
|
|
|
|
* update mems_allowed and rebind task's mempolicy and any vma
|
|
|
|
* mempolicies and if the cpuset is marked 'memory_migrate',
|
|
|
|
* migrate the tasks pages to the new memory.
|
2008-07-25 16:47:21 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with cpuset_mutex held. May take callback_lock during call.
|
2008-07-25 16:47:21 +08:00
|
|
|
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
|
2020-06-09 12:33:54 +08:00
|
|
|
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
|
2008-07-25 16:47:21 +08:00
|
|
|
* their mempolicies to the cpusets new mems_allowed.
|
|
|
|
*/
|
2009-01-08 10:08:43 +08:00
|
|
|
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
|
|
|
|
const char *buf)
|
2008-07-25 16:47:21 +08:00
|
|
|
{
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
/*
|
2012-12-13 05:51:24 +08:00
|
|
|
* top_cpuset.mems_allowed tracks node_states[N_MEMORY];
|
2008-07-25 16:47:21 +08:00
|
|
|
* it's read-only
|
|
|
|
*/
|
2010-03-24 04:35:35 +08:00
|
|
|
if (cs == &top_cpuset) {
|
|
|
|
retval = -EACCES;
|
|
|
|
goto done;
|
|
|
|
}
|
2008-07-25 16:47:21 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
|
|
|
|
* Since nodelist_parse() fails on an empty mask, we special case
|
|
|
|
* that parsing. The validate_change() call ensures that cpusets
|
|
|
|
* with tasks have memory.
|
|
|
|
*/
|
|
|
|
if (!*buf) {
|
2009-01-08 10:08:43 +08:00
|
|
|
nodes_clear(trialcs->mems_allowed);
|
2008-07-25 16:47:21 +08:00
|
|
|
} else {
|
2009-01-08 10:08:43 +08:00
|
|
|
retval = nodelist_parse(buf, trialcs->mems_allowed);
|
2008-07-25 16:47:21 +08:00
|
|
|
if (retval < 0)
|
|
|
|
goto done;
|
|
|
|
|
2009-01-08 10:08:43 +08:00
|
|
|
if (!nodes_subset(trialcs->mems_allowed,
|
2014-07-09 16:49:12 +08:00
|
|
|
top_cpuset.mems_allowed)) {
|
|
|
|
retval = -EINVAL;
|
2010-03-24 04:35:35 +08:00
|
|
|
goto done;
|
|
|
|
}
|
2008-07-25 16:47:21 +08:00
|
|
|
}
|
2013-06-09 17:15:08 +08:00
|
|
|
|
|
|
|
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
|
2008-07-25 16:47:21 +08:00
|
|
|
retval = 0; /* Too easy - nothing to do */
|
|
|
|
goto done;
|
|
|
|
}
|
2009-01-08 10:08:43 +08:00
|
|
|
retval = validate_change(cs, trialcs);
|
2008-07-25 16:47:21 +08:00
|
|
|
if (retval < 0)
|
|
|
|
goto done;
|
|
|
|
|
mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.
It can be reproduced with command:
$ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"
(where node 4 is a movable node)
runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
Call Trace:
dump_stack+0x6b/0x88
dump_header+0x4a/0x1e2
oom_kill_process.cold+0xb/0x10
out_of_memory.part.0+0xaf/0x230
out_of_memory+0x3d/0x80
__alloc_pages_slowpath.constprop.0+0x954/0xa20
__alloc_pages_nodemask+0x2d3/0x300
pipe_write+0x322/0x590
new_sync_write+0x196/0x1b0
vfs_write+0x1c3/0x1f0
ksys_write+0xa7/0xe0
do_syscall_64+0x52/0xd0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mem-Info:
active_anon:392832 inactive_anon:182 isolated_anon:0
active_file:68130 inactive_file:151527 isolated_file:0
unevictable:2701 dirty:0 writeback:7
slab_reclaimable:51418 slab_unreclaimable:116300
mapped:45825 shmem:735 pagetables:2540 bounce:0
free:159849484 free_pcp:73 free_cma:0
Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0 0
Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB
oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is involved even when normal nodes and movable
nodes both have plenty of free memory.
The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope. The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.
So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.
As page allocation is one of the hottest paths in the kernel, this check would
hurt all users with a sane cpuset configuration, so add a static branch check
and detect the abnormal config in the cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.
[thanks to Michal Hocko and David Rientjes for suggesting not handling
it inside OOM code, adding cpuset check, refining comments]
Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 04:40:34 +08:00
|
|
|
check_insane_mems_config(&trialcs->mems_allowed);
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2009-01-08 10:08:43 +08:00
|
|
|
cs->mems_allowed = trialcs->mems_allowed;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2008-07-25 16:47:21 +08:00
|
|
|
|
cpuset: update cs->effective_{cpus, mems} when config changes
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().
This won't introduce behavior change.
v3:
- add a WARN_ON() to check if effective masks are the same with configured
masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
it. Similar change for update_nodemasks_hier(). Suggested by Tejun.
v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:29 +08:00
|
|
|
/* use trialcs->mems_allowed as a temp variable */
|
2015-08-06 22:21:05 +08:00
|
|
|
update_nodemasks_hier(cs, &trialcs->mems_allowed);
|
2008-07-25 16:47:21 +08:00
|
|
|
done:
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2018-02-07 07:41:24 +08:00
|
|
|
bool current_cpuset_is_being_rebound(void)
|
2007-10-19 14:39:39 +08:00
|
|
|
{
|
2018-02-07 07:41:24 +08:00
|
|
|
bool ret;
|
cpuset,mempolicy: fix sleeping function called from invalid context
When runing with the kernel(3.15-rc7+), the follow bug occurs:
[ 9969.258987] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586
[ 9969.359906] in_atomic(): 1, irqs_disabled(): 0, pid: 160655, name: python
[ 9969.441175] INFO: lockdep is turned off.
[ 9969.488184] CPU: 26 PID: 160655 Comm: python Tainted: G A 3.15.0-rc7+ #85
[ 9969.581032] Hardware name: FUJITSU-SV PRIMEQUEST 1800E/SB, BIOS PRIMEQUEST 1000 Series BIOS Version 1.39 11/16/2012
[ 9969.706052] ffffffff81a20e60 ffff8803e941fbd0 ffffffff8162f523 ffff8803e941fd18
[ 9969.795323] ffff8803e941fbe0 ffffffff8109995a ffff8803e941fc58 ffffffff81633e6c
[ 9969.884710] ffffffff811ba5dc ffff880405c6b480 ffff88041fdd90a0 0000000000002000
[ 9969.974071] Call Trace:
[ 9970.003403] [<ffffffff8162f523>] dump_stack+0x4d/0x66
[ 9970.065074] [<ffffffff8109995a>] __might_sleep+0xfa/0x130
[ 9970.130743] [<ffffffff81633e6c>] mutex_lock_nested+0x3c/0x4f0
[ 9970.200638] [<ffffffff811ba5dc>] ? kmem_cache_alloc+0x1bc/0x210
[ 9970.272610] [<ffffffff81105807>] cpuset_mems_allowed+0x27/0x140
[ 9970.344584] [<ffffffff811b1303>] ? __mpol_dup+0x63/0x150
[ 9970.409282] [<ffffffff811b1385>] __mpol_dup+0xe5/0x150
[ 9970.471897] [<ffffffff811b1303>] ? __mpol_dup+0x63/0x150
[ 9970.536585] [<ffffffff81068c86>] ? copy_process.part.23+0x606/0x1d40
[ 9970.613763] [<ffffffff810bf28d>] ? trace_hardirqs_on+0xd/0x10
[ 9970.683660] [<ffffffff810ddddf>] ? monotonic_to_bootbased+0x2f/0x50
[ 9970.759795] [<ffffffff81068cf0>] copy_process.part.23+0x670/0x1d40
[ 9970.834885] [<ffffffff8106a598>] do_fork+0xd8/0x380
[ 9970.894375] [<ffffffff81110e4c>] ? __audit_syscall_entry+0x9c/0xf0
[ 9970.969470] [<ffffffff8106a8c6>] SyS_clone+0x16/0x20
[ 9971.030011] [<ffffffff81642009>] stub_clone+0x69/0x90
[ 9971.091573] [<ffffffff81641c29>] ? system_call_fastpath+0x16/0x1b
The cause is that cpuset_mems_allowed() try to take
mutex_lock(&callback_mutex) under the rcu_read_lock(which was hold in
__mpol_dup()). And in cpuset_mems_allowed(), the access to cpuset is
under rcu_read_lock, so in __mpol_dup, we can reduce the rcu_read_lock
protection region to protect the access to cpuset only in
current_cpuset_is_being_rebound(). So that we can avoid this bug.
This patch is a temporary solution that just addresses the bug
mentioned above, can not fix the long-standing issue about cpuset.mems
rebinding on fork():
"When the forker's task_struct is duplicated (which includes
->mems_allowed) and it races with an update to cpuset_being_rebound
in update_tasks_nodemask() then the task's mems_allowed doesn't get
updated. And the child task's mems_allowed can be wrong if the
cpuset's nodemask changes before the child has been added to the
cgroup's tasklist."
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable <stable@vger.kernel.org>
2014-06-25 09:57:18 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
ret = task_cs(current) == cpuset_being_rebound;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return ret;
|
2007-10-19 14:39:39 +08:00
|
|
|
}
|
|
|
|
|
2008-05-07 11:42:41 +08:00
|
|
|
/*
 * update_relax_domain_level - set the relax_domain_level of a cpuset
 * cs: the cpuset to update
 * val: the requested level
 *
 * On CONFIG_SMP builds, val must lie in [-1, sched_domain_level_max).
 * When the level actually changes and the cpuset both has CPUs and has
 * load balancing enabled, the sched domains are rebuilt.
 *
 * Returns 0 on success, -EINVAL if val is out of range.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	/* Nothing to do when the level is unchanged. */
	if (cs->relax_domain_level == val)
		return 0;

	cs->relax_domain_level = val;

	if (is_sched_load_balance(cs) && !cpumask_empty(cs->cpus_allowed))
		rebuild_sched_domains_locked();

	return 0;
}
|
|
|
|
|
2013-08-09 08:11:26 +08:00
|
|
|
/**
|
2009-06-17 06:31:47 +08:00
|
|
|
* update_tasks_flags - update the spread flags of tasks in the cpuset.
|
|
|
|
* @cs: the cpuset in which each task's spread flags needs to be changed
|
|
|
|
*
|
2014-02-13 19:58:40 +08:00
|
|
|
* Iterate through each task of @cs updating its spread flags. As this
|
2023-05-08 15:58:50 +08:00
|
|
|
* function is called with cpuset_mutex held, cpuset membership stays
|
2014-02-13 19:58:40 +08:00
|
|
|
* stable.
|
2009-06-17 06:31:47 +08:00
|
|
|
*/
|
2014-02-13 19:58:40 +08:00
|
|
|
static void update_tasks_flags(struct cpuset *cs)
|
2009-06-17 06:31:47 +08:00
|
|
|
{
|
2014-02-13 19:58:40 +08:00
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
|
|
|
|
2017-05-15 21:34:01 +08:00
|
|
|
css_task_iter_start(&cs->css, 0, &it);
|
2014-02-13 19:58:40 +08:00
|
|
|
while ((task = css_task_iter_next(&it)))
|
2022-11-13 06:19:38 +08:00
|
|
|
cpuset_update_task_spread_flags(cs, task);
|
2014-02-13 19:58:40 +08:00
|
|
|
css_task_iter_end(&it);
|
2009-06-17 06:31:47 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* update_flag - read a 0 or a 1 in a file and update associated flag
|
2008-04-29 16:00:26 +08:00
|
|
|
* bit: the bit to update (see cpuset_flagbits_t)
|
|
|
|
* cs: the cpuset to update
|
|
|
|
* turning_on: whether the flag is being set or cleared
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with cpuset_mutex held.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2008-04-29 16:00:00 +08:00
|
|
|
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
|
|
|
|
int turning_on)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-01-08 10:08:43 +08:00
|
|
|
struct cpuset *trialcs;
|
2008-10-19 11:28:18 +08:00
|
|
|
int balance_flag_changed;
|
2009-06-17 06:31:47 +08:00
|
|
|
int spread_flag_changed;
|
|
|
|
int err;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-01-08 10:08:43 +08:00
|
|
|
trialcs = alloc_trial_cpuset(cs);
|
|
|
|
if (!trialcs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (turning_on)
|
2009-01-08 10:08:43 +08:00
|
|
|
set_bit(bit, &trialcs->flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
else
|
2009-01-08 10:08:43 +08:00
|
|
|
clear_bit(bit, &trialcs->flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-01-08 10:08:43 +08:00
|
|
|
err = validate_change(cs, trialcs);
|
2005-06-26 05:57:34 +08:00
|
|
|
if (err < 0)
|
2009-01-08 10:08:43 +08:00
|
|
|
goto out;
|
2007-10-19 14:40:20 +08:00
|
|
|
|
|
|
|
balance_flag_changed = (is_sched_load_balance(cs) !=
|
2009-01-08 10:08:43 +08:00
|
|
|
is_sched_load_balance(trialcs));
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2009-06-17 06:31:47 +08:00
|
|
|
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|
|
|
|
|| (is_spread_page(cs) != is_spread_page(trialcs)));
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2009-01-08 10:08:43 +08:00
|
|
|
cs->flags = trialcs->flags;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2005-06-26 05:57:34 +08:00
|
|
|
|
2009-01-08 10:08:44 +08:00
|
|
|
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
|
2013-01-08 00:51:07 +08:00
|
|
|
rebuild_sched_domains_locked();
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2009-06-17 06:31:47 +08:00
|
|
|
if (spread_flag_changed)
|
2014-02-13 19:58:40 +08:00
|
|
|
update_tasks_flags(cs);
|
2009-01-08 10:08:43 +08:00
|
|
|
out:
|
2018-11-08 23:08:37 +08:00
|
|
|
free_cpuset(trialcs);
|
2009-01-08 10:08:43 +08:00
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
/**
|
2022-03-06 04:46:57 +08:00
|
|
|
* update_prstate - update partition_root_state
|
2022-09-02 04:57:37 +08:00
|
|
|
* @cs: the cpuset to update
|
|
|
|
* @new_prs: new partition root state
|
2022-09-02 04:57:41 +08:00
|
|
|
* Return: 0 if successful, != 0 if error
|
2018-11-08 23:08:38 +08:00
|
|
|
*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Call with cpuset_mutex held.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2021-07-20 22:18:26 +08:00
|
|
|
static int update_prstate(struct cpuset *cs, int new_prs)
|
2018-11-08 23:08:38 +08:00
|
|
|
{
|
2022-09-02 04:57:41 +08:00
|
|
|
int err = PERR_NONE, old_prs = cs->partition_root_state;
|
2018-11-08 23:08:38 +08:00
|
|
|
struct cpuset *parent = parent_cs(cs);
|
2021-07-20 22:18:26 +08:00
|
|
|
struct tmpmasks tmpmask;
|
2018-11-08 23:08:38 +08:00
|
|
|
|
2021-07-20 22:18:28 +08:00
|
|
|
if (old_prs == new_prs)
|
2018-11-08 23:08:38 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* For a previously invalid partition root, leave it at being
|
|
|
|
* invalid if new_prs is not "member".
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2022-09-02 04:57:40 +08:00
|
|
|
if (new_prs && is_prs_invalid(old_prs)) {
|
|
|
|
cs->partition_root_state = -new_prs;
|
2022-09-02 04:57:39 +08:00
|
|
|
return 0;
|
2022-09-02 04:57:40 +08:00
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
|
2021-07-20 22:18:26 +08:00
|
|
|
if (alloc_cpumasks(NULL, &tmpmask))
|
2018-11-08 23:08:38 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2023-06-27 22:35:01 +08:00
|
|
|
err = update_partition_exclusive(cs, new_prs);
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
|
2021-07-20 22:18:28 +08:00
|
|
|
if (!old_prs) {
|
2018-11-08 23:08:38 +08:00
|
|
|
/*
|
2023-06-27 22:35:01 +08:00
|
|
|
* cpus_allowed cannot be empty.
|
2018-11-08 23:08:38 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
if (cpumask_empty(cs->cpus_allowed)) {
|
2022-09-02 04:57:41 +08:00
|
|
|
err = PERR_CPUSEMPTY;
|
2018-11-08 23:08:38 +08:00
|
|
|
goto out;
|
2022-09-02 04:57:39 +08:00
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
|
|
|
|
err = update_parent_subparts_cpumask(cs, partcmd_enable,
|
2021-07-20 22:18:26 +08:00
|
|
|
NULL, &tmpmask);
|
2022-09-02 04:57:40 +08:00
|
|
|
} else if (old_prs && new_prs) {
|
|
|
|
/*
|
|
|
|
* A change in load balance state only, no change in cpumasks.
|
|
|
|
*/
|
2023-06-27 22:35:02 +08:00
|
|
|
;
|
2018-11-08 23:08:38 +08:00
|
|
|
} else {
|
2018-11-08 23:08:39 +08:00
|
|
|
/*
|
2022-09-02 04:57:39 +08:00
|
|
|
* Switching back to member is always allowed even if it
|
|
|
|
* disables child partitions.
|
2018-11-08 23:08:39 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
|
|
|
|
&tmpmask);
|
2018-11-08 23:08:39 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
|
|
|
* If there are child partitions, they will all become invalid.
|
|
|
|
*/
|
|
|
|
if (unlikely(cs->nr_subparts_cpus)) {
|
|
|
|
spin_lock_irq(&callback_lock);
|
|
|
|
cs->nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(cs->subparts_cpus);
|
|
|
|
compute_effective_cpumask(cs->effective_cpus, cs, parent);
|
|
|
|
spin_unlock_irq(&callback_lock);
|
|
|
|
}
|
2018-11-08 23:08:38 +08:00
|
|
|
}
|
|
|
|
out:
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
2023-06-27 22:35:01 +08:00
|
|
|
* Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
|
|
|
|
* happens.
|
2022-09-02 04:57:39 +08:00
|
|
|
*/
|
2023-06-27 22:35:01 +08:00
|
|
|
if (err) {
|
2022-09-02 04:57:40 +08:00
|
|
|
new_prs = -new_prs;
|
2023-06-27 22:35:01 +08:00
|
|
|
update_partition_exclusive(cs, new_prs);
|
|
|
|
}
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
|
|
|
cs->partition_root_state = new_prs;
|
2023-01-31 23:48:03 +08:00
|
|
|
WRITE_ONCE(cs->prs_err, err);
|
2022-09-02 04:57:39 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2023-06-27 22:35:01 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
/*
|
|
|
|
* Update child cpusets, if present.
|
|
|
|
* Force update if switching back to member.
|
|
|
|
*/
|
|
|
|
if (!list_empty(&cs->css.children))
|
2023-06-27 22:35:03 +08:00
|
|
|
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
|
2021-07-20 22:18:28 +08:00
|
|
|
|
2023-06-27 22:35:01 +08:00
|
|
|
/* Update sched domains and load balance flag */
|
|
|
|
update_partition_sd_lb(cs, old_prs);
|
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
notify_partition_change(cs, old_prs);
|
2021-07-20 22:18:26 +08:00
|
|
|
free_cpumasks(NULL, &tmpmask);
|
2022-09-02 04:57:39 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
/*
|
2006-07-01 00:27:16 +08:00
|
|
|
* Frequency meter - How fast is some event occurring?
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
*
|
|
|
|
* These routines manage a digitally filtered, constant time based,
|
|
|
|
* event frequency meter. There are four routines:
|
|
|
|
* fmeter_init() - initialize a frequency meter.
|
|
|
|
* fmeter_markevent() - called each time the event happens.
|
|
|
|
* fmeter_getrate() - returns the recent rate of such events.
|
|
|
|
* fmeter_update() - internal routine used to update fmeter.
|
|
|
|
*
|
|
|
|
* A common data structure is passed to each of these routines,
|
|
|
|
* which is used to keep track of the state required to manage the
|
|
|
|
* frequency meter and its digital filter.
|
|
|
|
*
|
|
|
|
* The filter works on the number of events marked per unit time.
|
|
|
|
* The filter is single-pole low-pass recursive (IIR). The time unit
|
|
|
|
* is 1 second. Arithmetic is done using 32-bit integers scaled to
|
|
|
|
* simulate 3 decimal digits of precision (multiplied by 1000).
|
|
|
|
*
|
|
|
|
* With an FM_COEF of 933, and a time base of 1 second, the filter
|
|
|
|
* has a half-life of 10 seconds, meaning that if the events quit
|
|
|
|
* happening, then the rate returned from the fmeter_getrate()
|
|
|
|
* will be cut in half each 10 seconds, until it converges to zero.
|
|
|
|
*
|
|
|
|
* It is not worth doing a real infinitely recursive filter. If more
|
|
|
|
* than FM_MAXTICKS ticks have elapsed since the last filter event,
|
|
|
|
* just compute FM_MAXTICKS ticks worth, by which point the level
|
|
|
|
* will be stable.
|
|
|
|
*
|
|
|
|
* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
|
|
|
|
* arithmetic overflow in the fmeter_update() routine.
|
|
|
|
*
|
|
|
|
* Given the simple 32 bit integer arithmetic used, this meter works
|
|
|
|
* best for reporting rates between one per millisecond (msec) and
|
|
|
|
* one per 32 (approx) seconds. At constant rates faster than one
|
|
|
|
* per msec it maxes out at values just under 1,000,000. At constant
|
|
|
|
* rates between one per msec, and one per second it will stabilize
|
|
|
|
* to a value N*1000, where N is the rate of events per second.
|
|
|
|
* At constant rates between one per second and one per 32 seconds,
|
|
|
|
* it will be choppy, moving up on the seconds that have an event,
|
|
|
|
* and then decaying until the next event. At rates slower than
|
|
|
|
* about one in 32 seconds, it decays all the way back to zero between
|
|
|
|
* each event.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Per-tick decay multiplier for the meter value, in units of 1/FM_SCALE. */
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
|
2015-11-25 23:16:55 +08:00
|
|
|
/*
 * Cap on the number of decay steps fmeter_update() applies in one call;
 * cast to u32 to match the tick counter it is compared against.
 */
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
/* Ceiling that fmeter_markevent() applies to the pending event count. */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
|
|
|
|
/* Each marked event adds FM_SCALE to the pending count (3 decimal digits). */
#define FM_SCALE 1000 /* faux fixed point scale */
|
|
|
|
|
|
|
|
/* Initialize a frequency meter */
|
|
|
|
static void fmeter_init(struct fmeter *fmp)
|
|
|
|
{
|
|
|
|
fmp->cnt = 0;
|
|
|
|
fmp->val = 0;
|
|
|
|
fmp->time = 0;
|
|
|
|
spin_lock_init(&fmp->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Internal meter update - process cnt events and update value */
|
|
|
|
static void fmeter_update(struct fmeter *fmp)
|
|
|
|
{
|
2015-11-25 23:16:55 +08:00
|
|
|
time64_t now;
|
|
|
|
u32 ticks;
|
|
|
|
|
|
|
|
now = ktime_get_seconds();
|
|
|
|
ticks = now - fmp->time;
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
|
|
|
|
if (ticks == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ticks = min(FM_MAXTICKS, ticks);
|
|
|
|
while (ticks-- > 0)
|
|
|
|
fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
|
|
|
|
fmp->time = now;
|
|
|
|
|
|
|
|
fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
|
|
|
|
fmp->cnt = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process any previous ticks, then bump cnt by one (times scale). */
|
|
|
|
static void fmeter_markevent(struct fmeter *fmp)
|
|
|
|
{
|
|
|
|
spin_lock(&fmp->lock);
|
|
|
|
fmeter_update(fmp);
|
|
|
|
fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
|
|
|
|
spin_unlock(&fmp->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process any previous ticks, then return current value. */
|
|
|
|
static int fmeter_getrate(struct fmeter *fmp)
|
|
|
|
{
|
|
|
|
int val;
|
|
|
|
|
|
|
|
spin_lock(&fmp->lock);
|
|
|
|
fmeter_update(fmp);
|
|
|
|
val = fmp->val;
|
|
|
|
spin_unlock(&fmp->lock);
|
|
|
|
return val;
|
|
|
|
}
|
|
|
|
|
2014-02-13 19:58:41 +08:00
|
|
|
/*
 * NOTE(review): presumably remembers the cpuset that tasks are being
 * moved out of while an attach is in progress - confirm against the
 * attach callbacks that read and write it.
 */
static struct cpuset *cpuset_attach_old_cs;
|
|
|
|
|
2023-04-11 21:35:59 +08:00
|
|
|
/*
|
|
|
|
* Check to see if a cpuset can accept a new task
|
|
|
|
* For v1, cpus_allowed and mems_allowed can't be empty.
|
|
|
|
* For v2, effective_cpus can't be empty.
|
|
|
|
* Note that in v1, effective_cpus = cpus_allowed.
|
|
|
|
*/
|
|
|
|
static int cpuset_can_attach_check(struct cpuset *cs)
|
|
|
|
{
|
|
|
|
if (cpumask_empty(cs->effective_cpus) ||
|
|
|
|
(!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
|
|
|
|
return -ENOSPC;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:54 +08:00
|
|
|
static void reset_migrate_dl_data(struct cpuset *cs)
|
|
|
|
{
|
|
|
|
cs->nr_migrate_dl_tasks = 0;
|
|
|
|
cs->sum_migrate_dl_bw = 0;
|
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accommodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
static int cpuset_can_attach(struct cgroup_taskset *tset)
|
2011-05-27 07:25:19 +08:00
|
|
|
{
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2023-05-08 15:58:54 +08:00
|
|
|
struct cpuset *cs, *oldcs;
|
2011-12-13 10:12:21 +08:00
|
|
|
struct task_struct *task;
|
2023-07-04 01:27:39 +08:00
|
|
|
bool cpus_updated, mems_updated;
|
2011-12-13 10:12:21 +08:00
|
|
|
int ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-02-13 19:58:41 +08:00
|
|
|
/* used later by cpuset_attach() */
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
|
2023-05-08 15:58:54 +08:00
|
|
|
oldcs = cpuset_attach_old_cs;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
cs = css_cs(css);
|
2014-02-13 19:58:41 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2023-04-11 21:35:59 +08:00
|
|
|
/* Check to see if task is allowed in the cpuset */
|
|
|
|
ret = cpuset_can_attach_check(cs);
|
|
|
|
if (ret)
|
2022-09-02 04:57:38 +08:00
|
|
|
goto out_unlock;
|
|
|
|
|
2023-07-04 01:27:39 +08:00
|
|
|
cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
|
|
|
|
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
cgroup_taskset_for_each(task, css, tset) {
|
2023-05-08 15:58:54 +08:00
|
|
|
ret = task_can_attach(task);
|
2014-09-19 17:22:40 +08:00
|
|
|
if (ret)
|
2013-01-08 00:51:08 +08:00
|
|
|
goto out_unlock;
|
2023-07-04 01:27:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Skip rights over task check in v2 when nothing changes,
|
|
|
|
* migration permission derives from hierarchy ownership in
|
|
|
|
* cgroup_procs_write_permission()).
|
|
|
|
*/
|
|
|
|
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
|
|
|
(cpus_updated || mems_updated)) {
|
|
|
|
ret = security_task_setscheduler(task);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2023-05-08 15:58:51 +08:00
|
|
|
|
|
|
|
if (dl_task(task)) {
|
2023-05-08 15:58:54 +08:00
|
|
|
cs->nr_migrate_dl_tasks++;
|
|
|
|
cs->sum_migrate_dl_bw += task->dl.dl_bw;
|
2023-05-08 15:58:51 +08:00
|
|
|
}
|
2011-12-13 10:12:21 +08:00
|
|
|
}
|
2011-05-27 07:25:19 +08:00
|
|
|
|
2023-05-08 15:58:54 +08:00
|
|
|
if (!cs->nr_migrate_dl_tasks)
|
|
|
|
goto out_success;
|
|
|
|
|
|
|
|
if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
|
|
|
|
int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
|
|
|
|
|
|
|
|
if (unlikely(cpu >= nr_cpu_ids)) {
|
|
|
|
reset_migrate_dl_data(cs);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
|
|
|
|
if (ret) {
|
|
|
|
reset_migrate_dl_data(cs);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out_success:
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
|
|
|
|
* Mark attach is in progress. This makes validate_change() fail
|
|
|
|
* changes which zero cpus/mems_allowed.
|
|
|
|
*/
|
|
|
|
cs->attach_in_progress++;
|
2013-01-08 00:51:08 +08:00
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2013-01-08 00:51:08 +08:00
|
|
|
return ret;
|
2007-10-19 14:39:39 +08:00
|
|
|
}
|
2011-05-27 07:25:19 +08:00
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers to also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accommodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
|
2013-01-08 00:51:07 +08:00
|
|
|
{
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2023-04-11 21:35:57 +08:00
|
|
|
struct cpuset *cs;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
|
|
|
|
cgroup_taskset_first(tset, &css);
|
2023-04-11 21:35:57 +08:00
|
|
|
cs = css_cs(css);
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2023-04-11 21:35:57 +08:00
|
|
|
cs->attach_in_progress--;
|
|
|
|
if (!cs->attach_in_progress)
|
|
|
|
wake_up(&cpuset_attach_wq);
|
2023-05-08 15:58:54 +08:00
|
|
|
|
|
|
|
if (cs->nr_migrate_dl_tasks) {
|
|
|
|
int cpu = cpumask_any(cs->effective_cpus);
|
|
|
|
|
|
|
|
dl_bw_free(cpu, cs->sum_migrate_dl_bw);
|
|
|
|
reset_migrate_dl_data(cs);
|
|
|
|
}
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2007-10-19 14:39:39 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
|
2023-05-08 15:58:50 +08:00
|
|
|
* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
|
2013-01-08 00:51:07 +08:00
|
|
|
* but we can't allocate it dynamically there. Define it global and
|
|
|
|
* allocate from cpuset_init().
|
|
|
|
*/
|
|
|
|
static cpumask_var_t cpus_attach;
|
2023-04-11 21:35:58 +08:00
|
|
|
/*
 * Node mask handed to cpuset_change_task_nodemask() by cpuset_attach_task().
 * cpuset_attach() fills it in before it iterates over the tasks being
 * attached and later copies it into cs->old_mems_allowed.
 */
static nodemask_t cpuset_attach_nodemask_to;
|
|
|
|
|
|
|
|
|
/*
 * cpuset_attach_task - apply the placement of cpuset @cs to one @task.
 * @cs:   destination cpuset
 * @task: task being attached
 *
 * The caller must hold cpuset_mutex (asserted on entry). The allowed CPU
 * mask is built in the file-scope scratch mask cpus_attach and applied with
 * set_cpus_allowed_ptr(); a failure there only triggers a one-time warning.
 * The task node mask is then updated from cpuset_attach_nodemask_to, which
 * the caller must have filled in beforehand, and the spread flags of the
 * task are updated through cpuset_update_task_spread_flags().
 */
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
|
|
|
|
{
|
2023-05-08 15:58:50 +08:00
|
|
|
lockdep_assert_held(&cpuset_mutex);
|
2023-04-11 21:35:58 +08:00
|
|
|
|
|
|
|
|
/*
 * For any cpuset other than top_cpuset, guarantee_online_cpus() fills in
 * cpus_attach. For top_cpuset the mask is the CPUs possible for the task
 * with cs->subparts_cpus removed.
 */
if (cs != &top_cpuset)
|
|
|
|
guarantee_online_cpus(task, cpus_attach);
|
|
|
|
else
|
2023-04-11 21:36:00 +08:00
|
|
|
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
|
|
|
|
cs->subparts_cpus);
|
2023-04-11 21:35:58 +08:00
|
|
|
/*
|
|
|
|
* can_attach beforehand should guarantee that this doesn't
|
|
|
|
* fail. TODO: have a better way to handle failure here
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
|
|
|
|
|
|
|
|
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
|
|
|
|
cpuset_update_task_spread_flags(cs, task);
|
|
|
|
}
|
2013-01-08 00:51:07 +08:00
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing the @css parameter from the three
migration methods, ->can_attach(), ->cancel_attach() and ->attach(), and
updating the cgroup_taskset iteration helpers to also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accommodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
/*
 * cpuset_attach - move every task in @tset into its destination cpuset.
 * @tset: set of tasks being migrated
 *
 * NOTE(review): presumably installed as the ->attach() method of the cpuset
 * controller; the registration is not visible here, confirm.
 *
 * The destination cpuset (cs) is taken from the first entry of @tset and the
 * source cpuset (oldcs) from cpuset_attach_old_cs. CPU hotplug must already
 * be held off by the caller (lockdep_assert_cpus_held()); cpuset_mutex is
 * taken here and released on the single exit path.
 *
 * Steps, in order:
 *  1. Compare the effective cpus and mems of cs and oldcs.
 *  2. On the default hierarchy, if neither changed, record cs->effective_mems
 *     in cpuset_attach_nodemask_to and jump straight to the bookkeeping at
 *     "out" without touching any task.
 *  3. Otherwise fill cpuset_attach_nodemask_to via guarantee_online_mems()
 *     and call cpuset_attach_task() for every task in @tset.
 *  4. Reset cpuset_attach_nodemask_to to cs->effective_mems. Unless memory
 *     migration is enabled on cs or the effective mems changed, skip to
 *     "out". Otherwise, for every thread-group leader that still has an mm,
 *     rebind its mempolicies and either migrate its pages from
 *     oldcs->old_mems_allowed (memory migration enabled) or just drop the mm
 *     reference with mmput(). Only the non-migrating branch calls mmput(),
 *     so cpuset_migrate_mm() presumably consumes the reference - confirm.
 *  5. At "out": remember the node mask in cs->old_mems_allowed, move the
 *     count of migrated deadline tasks from oldcs to cs and reset the
 *     per-migration deadline data, decrement cs->attach_in_progress and
 *     wake up cpuset_attach_wq once it reaches zero.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
|
2007-10-19 14:39:39 +08:00
|
|
|
{
|
2011-12-13 10:12:21 +08:00
|
|
|
struct task_struct *task;
|
2015-09-12 03:00:19 +08:00
|
|
|
struct task_struct *leader;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
struct cpuset *cs;
|
2014-02-13 19:58:41 +08:00
|
|
|
struct cpuset *oldcs = cpuset_attach_old_cs;
|
2022-11-13 06:19:39 +08:00
|
|
|
bool cpus_updated, mems_updated;
|
2006-06-23 17:04:00 +08:00
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
cgroup_taskset_first(tset, &css);
|
|
|
|
cs = css_cs(css);
|
|
|
|
|
2022-08-16 07:27:38 +08:00
|
|
|
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2022-11-13 06:19:39 +08:00
|
|
|
cpus_updated = !cpumask_equal(cs->effective_cpus,
|
|
|
|
oldcs->effective_cpus);
|
|
|
|
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the default hierarchy, enabling cpuset in the child cgroups
|
|
|
|
* will trigger a number of cpuset_attach() calls with no change
|
|
|
|
* in effective cpus and mems. In that case, we can optimize out
|
|
|
|
* by skipping the task iteration and update.
|
|
|
|
*/
|
|
|
|
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
|
|
|
!cpus_updated && !mems_updated) {
|
|
|
|
cpuset_attach_nodemask_to = cs->effective_mems;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2014-07-09 16:48:32 +08:00
|
|
|
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2023-04-11 21:35:58 +08:00
|
|
|
cgroup_taskset_for_each(task, css, tset)
|
|
|
|
cpuset_attach_task(cs, task);
|
2006-06-23 17:04:00 +08:00
|
|
|
|
2011-05-27 07:25:19 +08:00
|
|
|
/*
|
2015-09-12 03:00:19 +08:00
|
|
|
* Change mm for all threadgroup leaders. This is expensive and may
|
2022-11-13 06:19:39 +08:00
|
|
|
* sleep and should be moved outside migration path proper. Skip it
|
|
|
|
* if there is no change in effective_mems and CS_MEMORY_MIGRATE is
|
|
|
|
* not set.
|
2011-05-27 07:25:19 +08:00
|
|
|
*/
|
2014-07-09 16:48:32 +08:00
|
|
|
cpuset_attach_nodemask_to = cs->effective_mems;
|
2022-11-13 06:19:39 +08:00
|
|
|
if (!is_memory_migrate(cs) && !mems_updated)
|
|
|
|
goto out;
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
cgroup_taskset_for_each_leader(leader, css, tset) {
|
2015-09-12 03:00:18 +08:00
|
|
|
struct mm_struct *mm = get_task_mm(leader);
|
|
|
|
|
|
|
|
if (mm) {
|
|
|
|
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* old_mems_allowed is the same with mems_allowed
|
|
|
|
* here, except if this task is being moved
|
|
|
|
* automatically due to hotplug. In that case
|
|
|
|
* @mems_allowed has been updated and is empty, so
|
|
|
|
* @old_mems_allowed is the right nodesets that we
|
|
|
|
* migrate mm from.
|
|
|
|
*/
|
2016-01-20 01:18:41 +08:00
|
|
|
if (is_memory_migrate(cs))
|
2015-09-12 03:00:18 +08:00
|
|
|
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
|
|
|
|
&cpuset_attach_nodemask_to);
|
2016-01-20 01:18:41 +08:00
|
|
|
else
|
|
|
|
mmput(mm);
|
2013-06-13 15:11:44 +08:00
|
|
|
}
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
}
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2022-11-13 06:19:39 +08:00
|
|
|
out:
|
2013-06-09 17:15:08 +08:00
|
|
|
cs->old_mems_allowed = cpuset_attach_nodemask_to;
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2023-05-08 15:58:54 +08:00
|
|
|
if (cs->nr_migrate_dl_tasks) {
|
|
|
|
cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
|
|
|
|
oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
|
|
|
|
reset_migrate_dl_data(cs);
|
|
|
|
}
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
cs->attach_in_progress--;
|
2013-06-09 17:14:22 +08:00
|
|
|
if (!cs->attach_in_progress)
|
|
|
|
wake_up(&cpuset_attach_wq);
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * The various types of files and directories in a cpuset file system.
 *
 * cpuset_write_u64() reads one of these values from cft->private and
 * switches on it to decide which cpuset attribute a write updates.
 * No enumerator has an explicit value, so they are numbered from 0 in
 * declaration order.
 */
|
|
|
|
|
|
|
|
|
typedef enum {
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
FILE_MEMORY_MIGRATE,
|
2005-04-17 06:20:36 +08:00
|
|
|
FILE_CPULIST,
|
|
|
|
FILE_MEMLIST,
|
2014-07-09 16:49:25 +08:00
|
|
|
FILE_EFFECTIVE_CPULIST,
|
|
|
|
FILE_EFFECTIVE_MEMLIST,
|
2018-11-08 23:08:46 +08:00
|
|
|
FILE_SUBPARTS_CPULIST,
|
2005-04-17 06:20:36 +08:00
|
|
|
FILE_CPU_EXCLUSIVE,
|
|
|
|
FILE_MEM_EXCLUSIVE,
|
2008-04-29 16:00:26 +08:00
|
|
|
FILE_MEM_HARDWALL,
|
2007-10-19 14:40:20 +08:00
|
|
|
FILE_SCHED_LOAD_BALANCE,
|
2018-11-08 23:08:38 +08:00
|
|
|
FILE_PARTITION_ROOT,
|
2008-04-15 13:04:23 +08:00
|
|
|
FILE_SCHED_RELAX_DOMAIN_LEVEL,
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
FILE_MEMORY_PRESSURE_ENABLED,
|
|
|
|
FILE_MEMORY_PRESSURE,
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
FILE_SPREAD_PAGE,
|
|
|
|
FILE_SPREAD_SLAB,
|
2005-04-17 06:20:36 +08:00
|
|
|
} cpuset_filetype_t;
|
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
|
|
u64 val)
|
2008-04-29 16:00:00 +08:00
|
|
|
{
|
2013-08-09 08:11:24 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2008-04-29 16:00:00 +08:00
|
|
|
cpuset_filetype_t type = cft->private;
|
2013-08-13 10:05:59 +08:00
|
|
|
int retval = 0;
|
2008-04-29 16:00:00 +08:00
|
|
|
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-08-13 10:05:59 +08:00
|
|
|
if (!is_cpuset_online(cs)) {
|
|
|
|
retval = -ENODEV;
|
2013-01-08 00:51:08 +08:00
|
|
|
goto out_unlock;
|
2013-08-13 10:05:59 +08:00
|
|
|
}
|
2008-04-29 16:00:00 +08:00
|
|
|
|
|
|
|
switch (type) {
|
2005-04-17 06:20:36 +08:00
|
|
|
case FILE_CPU_EXCLUSIVE:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case FILE_MEM_EXCLUSIVE:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2008-04-29 16:00:26 +08:00
|
|
|
case FILE_MEM_HARDWALL:
|
|
|
|
retval = update_flag(CS_MEM_HARDWALL, cs, val);
|
|
|
|
break;
|
2007-10-19 14:40:20 +08:00
|
|
|
case FILE_SCHED_LOAD_BALANCE:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
|
2008-04-15 13:04:23 +08:00
|
|
|
break;
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
case FILE_MEMORY_MIGRATE:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
break;
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
case FILE_MEMORY_PRESSURE_ENABLED:
|
2008-04-29 16:00:00 +08:00
|
|
|
cpuset_memory_pressure_enabled = !!val;
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
break;
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
case FILE_SPREAD_PAGE:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_SPREAD_PAGE, cs, val);
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
break;
|
|
|
|
case FILE_SPREAD_SLAB:
|
2008-04-29 16:00:00 +08:00
|
|
|
retval = update_flag(CS_SPREAD_SLAB, cs, val);
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
default:
|
|
|
|
retval = -EINVAL;
|
2008-04-29 16:00:00 +08:00
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2013-01-08 00:51:08 +08:00
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2005-04-17 06:20:36 +08:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
|
|
s64 val)
|
2008-05-07 11:42:41 +08:00
|
|
|
{
|
2013-08-09 08:11:24 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2008-05-07 11:42:41 +08:00
|
|
|
cpuset_filetype_t type = cft->private;
|
2013-01-08 00:51:08 +08:00
|
|
|
int retval = -ENODEV;
|
2008-05-07 11:42:41 +08:00
|
|
|
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:08 +08:00
|
|
|
if (!is_cpuset_online(cs))
|
|
|
|
goto out_unlock;
|
2008-07-25 16:47:02 +08:00
|
|
|
|
2008-05-07 11:42:41 +08:00
|
|
|
switch (type) {
|
|
|
|
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
|
|
|
|
retval = update_relax_domain_level(cs, val);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
retval = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
2013-01-08 00:51:08 +08:00
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2008-05-07 11:42:41 +08:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2008-07-25 16:47:02 +08:00
|
|
|
/*
|
|
|
|
* Common handling for a write to a "cpus" or "mems" file.
|
|
|
|
*/
|
2014-05-14 00:16:21 +08:00
|
|
|
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
2008-07-25 16:47:02 +08:00
|
|
|
{
|
2014-05-14 00:16:21 +08:00
|
|
|
struct cpuset *cs = css_cs(of_css(of));
|
2009-01-08 10:08:43 +08:00
|
|
|
struct cpuset *trialcs;
|
2013-01-08 00:51:08 +08:00
|
|
|
int retval = -ENODEV;
|
2008-07-25 16:47:02 +08:00
|
|
|
|
2014-05-14 00:16:21 +08:00
|
|
|
buf = strstrip(buf);
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/*
|
|
|
|
* CPU or memory hotunplug may leave @cs w/o any execution
|
|
|
|
* resources, in which case the hotplug code asynchronously updates
|
|
|
|
* configuration and transfers all tasks to the nearest ancestor
|
|
|
|
* which can execute.
|
|
|
|
*
|
|
|
|
* As writes to "cpus" or "mems" may restore @cs's execution
|
|
|
|
* resources, wait for the previously scheduled operations before
|
|
|
|
* proceeding, so that we don't end up keep removing tasks added
|
|
|
|
* after execution capability is restored.
|
2014-07-01 03:47:32 +08:00
|
|
|
*
|
|
|
|
* cpuset_hotplug_work calls back into cgroup core via
|
|
|
|
* cgroup_transfer_tasks() and waiting for it from a cgroupfs
|
|
|
|
* operation like this one can lead to a deadlock through kernfs
|
|
|
|
* active_ref protection. Let's break the protection. Losing the
|
|
|
|
* protection is okay as we check whether @cs is online after
|
2023-05-08 15:58:50 +08:00
|
|
|
* grabbing cpuset_mutex anyway. This only happens on the legacy
|
2014-07-01 03:47:32 +08:00
|
|
|
* hierarchies.
|
2013-01-08 00:51:07 +08:00
|
|
|
*/
|
2014-07-01 03:47:32 +08:00
|
|
|
css_get(&cs->css);
|
|
|
|
kernfs_break_active_protection(of->kn);
|
2013-01-08 00:51:07 +08:00
|
|
|
flush_work(&cpuset_hotplug_work);
|
|
|
|
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:08 +08:00
|
|
|
if (!is_cpuset_online(cs))
|
|
|
|
goto out_unlock;
|
2008-07-25 16:47:02 +08:00
|
|
|
|
2009-01-08 10:08:43 +08:00
|
|
|
trialcs = alloc_trial_cpuset(cs);
|
2011-03-05 09:36:21 +08:00
|
|
|
if (!trialcs) {
|
|
|
|
retval = -ENOMEM;
|
2013-01-08 00:51:08 +08:00
|
|
|
goto out_unlock;
|
2011-03-05 09:36:21 +08:00
|
|
|
}
|
2009-01-08 10:08:43 +08:00
|
|
|
|
2014-05-14 00:16:21 +08:00
|
|
|
switch (of_cft(of)->private) {
|
2008-07-25 16:47:02 +08:00
|
|
|
case FILE_CPULIST:
|
2009-01-08 10:08:43 +08:00
|
|
|
retval = update_cpumask(cs, trialcs, buf);
|
2008-07-25 16:47:02 +08:00
|
|
|
break;
|
|
|
|
case FILE_MEMLIST:
|
2009-01-08 10:08:43 +08:00
|
|
|
retval = update_nodemask(cs, trialcs, buf);
|
2008-07-25 16:47:02 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
retval = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
2009-01-08 10:08:43 +08:00
|
|
|
|
2018-11-08 23:08:37 +08:00
|
|
|
free_cpuset(trialcs);
|
2013-01-08 00:51:08 +08:00
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2014-07-01 03:47:32 +08:00
|
|
|
kernfs_unbreak_active_protection(of->kn);
|
|
|
|
css_put(&cs->css);
|
2016-01-20 01:18:41 +08:00
|
|
|
flush_workqueue(cpuset_migrate_mm_wq);
|
2014-05-14 00:16:21 +08:00
|
|
|
return retval ?: nbytes;
|
2008-07-25 16:47:02 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* These ascii lists should be read in a single call, by using a user
|
|
|
|
* buffer large enough to hold the entire map. If read in smaller
|
|
|
|
* chunks, there is no guarantee of atomicity. Since the display format
|
|
|
|
* used, list of ranges of sequential numbers, is variable length,
|
|
|
|
* and since these maps can change value dynamically, one could read
|
|
|
|
* gibberish by doing partial reads while a list was changing.
|
|
|
|
*/
|
2013-12-06 01:28:04 +08:00
|
|
|
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-12-06 01:28:04 +08:00
|
|
|
struct cpuset *cs = css_cs(seq_css(sf));
|
|
|
|
cpuset_filetype_t type = seq_cft(sf)->private;
|
2013-12-06 01:28:02 +08:00
|
|
|
int ret = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case FILE_CPULIST:
|
2015-02-14 06:37:23 +08:00
|
|
|
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case FILE_MEMLIST:
|
2015-02-14 06:37:23 +08:00
|
|
|
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2014-07-09 16:49:25 +08:00
|
|
|
case FILE_EFFECTIVE_CPULIST:
|
2015-02-14 06:37:23 +08:00
|
|
|
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
|
2014-07-09 16:49:25 +08:00
|
|
|
break;
|
|
|
|
case FILE_EFFECTIVE_MEMLIST:
|
2015-02-14 06:37:23 +08:00
|
|
|
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
|
2014-07-09 16:49:25 +08:00
|
|
|
break;
|
2018-11-08 23:08:46 +08:00
|
|
|
case FILE_SUBPARTS_CPULIST:
|
|
|
|
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
|
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
default:
|
2013-12-06 01:28:02 +08:00
|
|
|
ret = -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2013-12-06 01:28:02 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
|
2008-04-29 16:00:00 +08:00
|
|
|
{
|
2013-08-09 08:11:24 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2008-04-29 16:00:00 +08:00
|
|
|
cpuset_filetype_t type = cft->private;
|
|
|
|
switch (type) {
|
|
|
|
case FILE_CPU_EXCLUSIVE:
|
|
|
|
return is_cpu_exclusive(cs);
|
|
|
|
case FILE_MEM_EXCLUSIVE:
|
|
|
|
return is_mem_exclusive(cs);
|
2008-04-29 16:00:26 +08:00
|
|
|
case FILE_MEM_HARDWALL:
|
|
|
|
return is_mem_hardwall(cs);
|
2008-04-29 16:00:00 +08:00
|
|
|
case FILE_SCHED_LOAD_BALANCE:
|
|
|
|
return is_sched_load_balance(cs);
|
|
|
|
case FILE_MEMORY_MIGRATE:
|
|
|
|
return is_memory_migrate(cs);
|
|
|
|
case FILE_MEMORY_PRESSURE_ENABLED:
|
|
|
|
return cpuset_memory_pressure_enabled;
|
|
|
|
case FILE_MEMORY_PRESSURE:
|
|
|
|
return fmeter_getrate(&cs->fmeter);
|
|
|
|
case FILE_SPREAD_PAGE:
|
|
|
|
return is_spread_page(cs);
|
|
|
|
case FILE_SPREAD_SLAB:
|
|
|
|
return is_spread_slab(cs);
|
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
|
|
|
|
/* Unreachable but makes gcc happy */
|
|
|
|
return 0;
|
2008-04-29 16:00:00 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
|
2008-05-07 11:42:41 +08:00
|
|
|
{
|
2013-08-09 08:11:24 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2008-05-07 11:42:41 +08:00
|
|
|
cpuset_filetype_t type = cft->private;
|
|
|
|
switch (type) {
|
|
|
|
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
|
|
|
|
return cs->relax_domain_level;
|
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
|
2021-04-08 16:03:46 +08:00
|
|
|
/* Unreachable but makes gcc happy */
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
return 0;
|
2008-05-07 11:42:41 +08:00
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:44 +08:00
|
|
|
static int sched_partition_show(struct seq_file *seq, void *v)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = css_cs(seq_css(seq));
|
2022-09-02 04:57:41 +08:00
|
|
|
const char *err, *type = NULL;
|
2018-11-08 23:08:44 +08:00
|
|
|
|
|
|
|
switch (cs->partition_root_state) {
|
2022-09-02 04:57:37 +08:00
|
|
|
case PRS_ROOT:
|
2018-11-08 23:08:44 +08:00
|
|
|
seq_puts(seq, "root\n");
|
|
|
|
break;
|
2022-09-02 04:57:40 +08:00
|
|
|
case PRS_ISOLATED:
|
|
|
|
seq_puts(seq, "isolated\n");
|
|
|
|
break;
|
2022-09-02 04:57:37 +08:00
|
|
|
case PRS_MEMBER:
|
2018-11-08 23:08:44 +08:00
|
|
|
seq_puts(seq, "member\n");
|
|
|
|
break;
|
2022-09-02 04:57:37 +08:00
|
|
|
case PRS_INVALID_ROOT:
|
2022-09-02 04:57:41 +08:00
|
|
|
type = "root";
|
|
|
|
fallthrough;
|
2022-09-02 04:57:40 +08:00
|
|
|
case PRS_INVALID_ISOLATED:
|
2022-09-02 04:57:41 +08:00
|
|
|
if (!type)
|
|
|
|
type = "isolated";
|
|
|
|
err = perr_strings[READ_ONCE(cs->prs_err)];
|
|
|
|
if (err)
|
|
|
|
seq_printf(seq, "%s invalid (%s)\n", type, err);
|
|
|
|
else
|
|
|
|
seq_printf(seq, "%s invalid\n", type);
|
|
|
|
break;
|
2018-11-08 23:08:44 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
|
|
|
|
size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = css_cs(of_css(of));
|
|
|
|
int val;
|
|
|
|
int retval = -ENODEV;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
|
|
|
|
|
/*
|
2018-11-14 04:03:33 +08:00
|
|
|
* Convert "root" to ENABLED, and convert "member" to DISABLED.
|
2018-11-08 23:08:44 +08:00
|
|
|
*/
|
2018-11-14 04:03:33 +08:00
|
|
|
if (!strcmp(buf, "root"))
|
2022-09-02 04:57:37 +08:00
|
|
|
val = PRS_ROOT;
|
2018-11-14 04:03:33 +08:00
|
|
|
else if (!strcmp(buf, "member"))
|
2022-09-02 04:57:37 +08:00
|
|
|
val = PRS_MEMBER;
|
2022-09-02 04:57:40 +08:00
|
|
|
else if (!strcmp(buf, "isolated"))
|
|
|
|
val = PRS_ISOLATED;
|
2018-11-08 23:08:44 +08:00
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
css_get(&cs->css);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2018-11-08 23:08:44 +08:00
|
|
|
if (!is_cpuset_online(cs))
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
retval = update_prstate(cs, val);
|
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2018-11-08 23:08:44 +08:00
|
|
|
css_put(&cs->css);
|
|
|
|
return retval ?: nbytes;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* for the common functions, 'private' gives the type of file
|
|
|
|
*/
|
|
|
|
|
2018-11-08 23:08:35 +08:00
|
|
|
static struct cftype legacy_files[] = {
|
2008-04-29 16:00:26 +08:00
|
|
|
{
|
|
|
|
.name = "cpus",
|
2013-12-06 01:28:04 +08:00
|
|
|
.seq_show = cpuset_common_seq_show,
|
2014-05-14 00:16:21 +08:00
|
|
|
.write = cpuset_write_resmask,
|
2008-07-25 16:47:02 +08:00
|
|
|
.max_write_len = (100U + 6 * NR_CPUS),
|
2008-04-29 16:00:26 +08:00
|
|
|
.private = FILE_CPULIST,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "mems",
|
2013-12-06 01:28:04 +08:00
|
|
|
.seq_show = cpuset_common_seq_show,
|
2014-05-14 00:16:21 +08:00
|
|
|
.write = cpuset_write_resmask,
|
2008-07-25 16:47:02 +08:00
|
|
|
.max_write_len = (100U + 6 * MAX_NUMNODES),
|
2008-04-29 16:00:26 +08:00
|
|
|
.private = FILE_MEMLIST,
|
|
|
|
},
|
|
|
|
|
2014-07-09 16:49:25 +08:00
|
|
|
{
|
|
|
|
.name = "effective_cpus",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.private = FILE_EFFECTIVE_CPULIST,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "effective_mems",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.private = FILE_EFFECTIVE_MEMLIST,
|
|
|
|
},
|
|
|
|
|
2008-04-29 16:00:26 +08:00
|
|
|
{
|
|
|
|
.name = "cpu_exclusive",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_CPU_EXCLUSIVE,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "mem_exclusive",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_MEM_EXCLUSIVE,
|
|
|
|
},
|
|
|
|
|
2008-04-29 16:00:26 +08:00
|
|
|
{
|
|
|
|
.name = "mem_hardwall",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_MEM_HARDWALL,
|
|
|
|
},
|
|
|
|
|
2008-04-29 16:00:26 +08:00
|
|
|
{
|
|
|
|
.name = "sched_load_balance",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_SCHED_LOAD_BALANCE,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "sched_relax_domain_level",
|
2008-05-07 11:42:41 +08:00
|
|
|
.read_s64 = cpuset_read_s64,
|
|
|
|
.write_s64 = cpuset_write_s64,
|
2008-04-29 16:00:26 +08:00
|
|
|
.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "memory_migrate",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_MEMORY_MIGRATE,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "memory_pressure",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
2017-08-25 00:04:29 +08:00
|
|
|
.private = FILE_MEMORY_PRESSURE,
|
2008-04-29 16:00:26 +08:00
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "memory_spread_page",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_SPREAD_PAGE,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "memory_spread_slab",
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_SPREAD_SLAB,
|
|
|
|
},
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
|
2012-04-02 03:09:55 +08:00
|
|
|
{
|
|
|
|
.name = "memory_pressure_enabled",
|
|
|
|
.flags = CFTYPE_ONLY_ON_ROOT,
|
|
|
|
.read_u64 = cpuset_read_u64,
|
|
|
|
.write_u64 = cpuset_write_u64,
|
|
|
|
.private = FILE_MEMORY_PRESSURE_ENABLED,
|
|
|
|
},
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-04-02 03:09:55 +08:00
|
|
|
{ } /* terminate */
|
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-11-08 23:08:35 +08:00
|
|
|
/*
|
|
|
|
* This is currently a minimal set for the default hierarchy. It can be
|
|
|
|
* expanded later on by migrating more features and control files from v1.
|
|
|
|
*/
|
|
|
|
static struct cftype dfl_files[] = {
|
|
|
|
{
|
|
|
|
.name = "cpus",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.write = cpuset_write_resmask,
|
|
|
|
.max_write_len = (100U + 6 * NR_CPUS),
|
|
|
|
.private = FILE_CPULIST,
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "mems",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.write = cpuset_write_resmask,
|
|
|
|
.max_write_len = (100U + 6 * MAX_NUMNODES),
|
|
|
|
.private = FILE_MEMLIST,
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "cpus.effective",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.private = FILE_EFFECTIVE_CPULIST,
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
.name = "mems.effective",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.private = FILE_EFFECTIVE_MEMLIST,
|
|
|
|
},
|
|
|
|
|
2018-11-08 23:08:38 +08:00
|
|
|
{
|
2018-11-14 04:03:33 +08:00
|
|
|
.name = "cpus.partition",
|
2018-11-08 23:08:44 +08:00
|
|
|
.seq_show = sched_partition_show,
|
|
|
|
.write = sched_partition_write,
|
2018-11-08 23:08:38 +08:00
|
|
|
.private = FILE_PARTITION_ROOT,
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
2021-08-11 11:06:02 +08:00
|
|
|
.file_offset = offsetof(struct cpuset, partition_file),
|
2018-11-08 23:08:38 +08:00
|
|
|
},
|
|
|
|
|
2018-11-08 23:08:46 +08:00
|
|
|
{
|
|
|
|
.name = "cpus.subpartitions",
|
|
|
|
.seq_show = cpuset_common_seq_show,
|
|
|
|
.private = FILE_SUBPARTS_CPULIST,
|
|
|
|
.flags = CFTYPE_DEBUG,
|
|
|
|
},
|
|
|
|
|
2018-11-08 23:08:35 +08:00
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2022-11-17 15:15:57 +08:00
|
|
|
/**
|
|
|
|
* cpuset_css_alloc - Allocate a cpuset css
|
|
|
|
* @parent_css: Parent css of the control group that the new cpuset will be
|
|
|
|
* part of
|
|
|
|
* Return: cpuset css on success, -ENOMEM on failure.
|
|
|
|
*
|
|
|
|
* Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
|
|
|
|
* top cpuset css otherwise.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2013-08-09 08:11:23 +08:00
|
|
|
static struct cgroup_subsys_state *
|
|
|
|
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-01-08 00:51:07 +08:00
|
|
|
struct cpuset *cs;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
if (!parent_css)
|
2007-10-19 14:39:39 +08:00
|
|
|
return &top_cpuset.css;
|
2012-11-20 00:13:39 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!cs)
|
2007-10-19 14:39:39 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2018-11-08 23:08:37 +08:00
|
|
|
|
|
|
|
if (alloc_cpumasks(cs, NULL)) {
|
|
|
|
kfree(cs);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-08-12 03:57:07 +08:00
|
|
|
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
2008-04-05 09:11:07 +08:00
|
|
|
nodes_clear(cs->mems_allowed);
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
nodes_clear(cs->effective_mems);
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
fmeter_init(&cs->fmeter);
|
2008-04-15 13:04:23 +08:00
|
|
|
cs->relax_domain_level = -1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-08-12 03:57:07 +08:00
|
|
|
/* Set CS_MEMORY_MIGRATE for default hierarchy */
|
|
|
|
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
|
|
|
|
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
return &cs->css;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static int cpuset_css_online(struct cgroup_subsys_state *css)
|
2013-01-08 00:51:07 +08:00
|
|
|
{
|
2013-08-09 08:11:23 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2013-01-08 00:51:08 +08:00
|
|
|
struct cpuset *parent = parent_cs(cs);
|
2013-01-08 00:51:07 +08:00
|
|
|
struct cpuset *tmp_cs;
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2013-01-08 00:51:07 +08:00
|
|
|
|
|
|
|
if (!parent)
|
|
|
|
return 0;
|
|
|
|
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:08 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
set_bit(CS_ONLINE, &cs->flags);
|
2013-01-08 00:51:07 +08:00
|
|
|
if (is_spread_page(parent))
|
|
|
|
set_bit(CS_SPREAD_PAGE, &cs->flags);
|
|
|
|
if (is_spread_slab(parent))
|
|
|
|
set_bit(CS_SPREAD_SLAB, &cs->flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-06-05 07:10:08 +08:00
|
|
|
cpuset_inc();
|
2012-11-20 00:13:39 +08:00
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2017-08-18 03:33:10 +08:00
|
|
|
if (is_in_v2_mode()) {
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
|
|
|
|
cs->effective_mems = parent->effective_mems;
|
2018-11-08 23:08:40 +08:00
|
|
|
cs->use_parent_ecpus = true;
|
|
|
|
parent->child_ecpus_count++;
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
}
|
2023-06-27 22:35:00 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
|
|
|
|
*/
|
|
|
|
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
|
|
|
!is_sched_load_balance(parent))
|
|
|
|
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
|
2013-01-08 00:51:08 +08:00
|
|
|
goto out_unlock;
|
2012-11-20 00:13:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
|
|
|
|
* set. This flag handling is implemented in cgroup core for
|
2022-03-06 04:46:57 +08:00
|
|
|
* historical reasons - the flag may be specified during mount.
|
2012-11-20 00:13:39 +08:00
|
|
|
*
|
|
|
|
* Currently, if any sibling cpusets have exclusive cpus or mem, we
|
|
|
|
* refuse to clone the configuration - thereby refusing the task to
|
|
|
|
* be entered, and as a result refusing the sys_unshare() or
|
|
|
|
* clone() which initiated it. If this becomes a problem for some
|
|
|
|
* users who wish to allow that scenario, then this could be
|
|
|
|
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
|
|
|
|
* (and likewise for mems) to the new cgroup.
|
|
|
|
*/
|
2013-01-08 00:51:07 +08:00
|
|
|
rcu_read_lock();
|
2013-08-09 08:11:25 +08:00
|
|
|
cpuset_for_each_child(tmp_cs, pos_css, parent) {
|
2013-01-08 00:51:07 +08:00
|
|
|
if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
|
|
|
|
rcu_read_unlock();
|
2013-01-08 00:51:08 +08:00
|
|
|
goto out_unlock;
|
2013-01-08 00:51:07 +08:00
|
|
|
}
|
2012-11-20 00:13:39 +08:00
|
|
|
}
|
2013-01-08 00:51:07 +08:00
|
|
|
rcu_read_unlock();
|
2012-11-20 00:13:39 +08:00
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2012-11-20 00:13:39 +08:00
|
|
|
cs->mems_allowed = parent->mems_allowed;
|
2015-02-13 11:19:49 +08:00
|
|
|
cs->effective_mems = parent->mems_allowed;
|
2012-11-20 00:13:39 +08:00
|
|
|
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
|
2015-02-13 11:19:49 +08:00
|
|
|
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
|
2014-10-27 21:27:02 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2013-01-08 00:51:08 +08:00
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2013-01-08 00:51:07 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-07-27 11:56:53 +08:00
|
|
|
/*
|
|
|
|
* If the cpuset being removed has its flag 'sched_load_balance'
|
|
|
|
* enabled, then simulate turning sched_load_balance off, which
|
2018-11-08 23:08:38 +08:00
|
|
|
* will call rebuild_sched_domains_locked(). That is not needed
|
|
|
|
* in the default hierarchy where only changes in partition
|
|
|
|
* will cause repartitioning.
|
|
|
|
*
|
|
|
|
* If the cpuset has the 'sched.partition' flag enabled, simulate
|
|
|
|
* turning 'sched.partition" off.
|
2013-07-27 11:56:53 +08:00
|
|
|
*/
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static void cpuset_css_offline(struct cgroup_subsys_state *css)
|
2013-01-08 00:51:07 +08:00
|
|
|
{
|
2013-08-09 08:11:23 +08:00
|
|
|
struct cpuset *cs = css_cs(css);
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_lock();
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2022-09-02 04:57:37 +08:00
|
|
|
if (is_partition_valid(cs))
|
2018-11-08 23:08:38 +08:00
|
|
|
update_prstate(cs, 0);
|
|
|
|
|
|
|
|
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
|
|
|
is_sched_load_balance(cs))
|
2013-01-08 00:51:07 +08:00
|
|
|
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
|
|
|
|
|
2018-11-08 23:08:40 +08:00
|
|
|
if (cs->use_parent_ecpus) {
|
|
|
|
struct cpuset *parent = parent_cs(cs);
|
|
|
|
|
|
|
|
cs->use_parent_ecpus = false;
|
|
|
|
parent->child_ecpus_count--;
|
|
|
|
}
|
|
|
|
|
2014-06-05 07:10:08 +08:00
|
|
|
cpuset_dec();
|
2013-01-08 00:51:07 +08:00
|
|
|
clear_bit(CS_ONLINE, &cs->flags);
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2021-08-03 22:16:07 +08:00
|
|
|
cpus_read_unlock();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
/* Free the cpuset obtained from @css by handing it to free_cpuset(). */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	free_cpuset(css_cs(css));
}
|
|
|
|
|
2014-07-09 16:48:01 +08:00
|
|
|
static void cpuset_bind(struct cgroup_subsys_state *root_css)
|
|
|
|
{
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2014-07-09 16:48:01 +08:00
|
|
|
|
2017-08-18 03:33:10 +08:00
|
|
|
if (is_in_v2_mode()) {
|
2014-07-09 16:48:01 +08:00
|
|
|
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
|
|
|
|
top_cpuset.mems_allowed = node_possible_map;
|
|
|
|
} else {
|
|
|
|
cpumask_copy(top_cpuset.cpus_allowed,
|
|
|
|
top_cpuset.effective_cpus);
|
|
|
|
top_cpuset.mems_allowed = top_cpuset.effective_mems;
|
|
|
|
}
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2014-07-09 16:48:01 +08:00
|
|
|
}
|
|
|
|
|
2023-04-11 21:35:59 +08:00
|
|
|
/*
|
|
|
|
* In case the child is cloned into a cpuset different from its parent,
|
|
|
|
* additional checks are done to see if the move is allowed.
|
|
|
|
*/
|
|
|
|
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
|
|
|
|
bool same_cs;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
same_cs = (cs == task_cs(current));
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (same_cs)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&cgroup_mutex);
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2023-04-11 21:35:59 +08:00
|
|
|
|
|
|
|
/* Check to see if task is allowed in the cpuset */
|
|
|
|
ret = cpuset_can_attach_check(cs);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2023-05-08 15:58:54 +08:00
|
|
|
ret = task_can_attach(task);
|
2023-04-11 21:35:59 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
ret = security_task_setscheduler(task);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark attach is in progress. This makes validate_change() fail
|
|
|
|
* changes which zero cpus/mems_allowed.
|
|
|
|
*/
|
|
|
|
cs->attach_in_progress++;
|
|
|
|
out_unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2023-04-11 21:35:59 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
|
|
|
|
{
|
|
|
|
struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
|
|
|
|
bool same_cs;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
same_cs = (cs == task_cs(current));
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (same_cs)
|
|
|
|
return;
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2023-04-11 21:35:59 +08:00
|
|
|
cs->attach_in_progress--;
|
|
|
|
if (!cs->attach_in_progress)
|
|
|
|
wake_up(&cpuset_attach_wq);
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2023-04-11 21:35:59 +08:00
|
|
|
}
|
|
|
|
|
2016-08-09 11:25:01 +08:00
|
|
|
/*
|
|
|
|
* Make sure the new task conform to the current state of its parent,
|
|
|
|
* which could have been changed by cpuset just after it inherits the
|
|
|
|
* state from the parent and before it sits on the cgroup's task list.
|
|
|
|
*/
|
2016-09-16 21:02:37 +08:00
|
|
|
static void cpuset_fork(struct task_struct *task)
|
2016-08-09 11:25:01 +08:00
|
|
|
{
|
2023-04-11 21:35:58 +08:00
|
|
|
struct cpuset *cs;
|
|
|
|
bool same_cs;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
cs = task_cs(task);
|
|
|
|
same_cs = (cs == task_cs(current));
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (same_cs) {
|
|
|
|
if (cs == &top_cpuset)
|
|
|
|
return;
|
|
|
|
|
|
|
|
set_cpus_allowed_ptr(task, current->cpus_ptr);
|
|
|
|
task->mems_allowed = current->mems_allowed;
|
2016-08-09 11:25:01 +08:00
|
|
|
return;
|
2023-04-11 21:35:58 +08:00
|
|
|
}
|
2016-08-09 11:25:01 +08:00
|
|
|
|
2023-04-11 21:35:58 +08:00
|
|
|
/* CLONE_INTO_CGROUP */
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2023-04-11 21:35:58 +08:00
|
|
|
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
|
|
|
|
cpuset_attach_task(cs, task);
|
2023-04-11 21:35:59 +08:00
|
|
|
|
|
|
|
cs->attach_in_progress--;
|
|
|
|
if (!cs->attach_in_progress)
|
|
|
|
wake_up(&cpuset_attach_wq);
|
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2016-08-09 11:25:01 +08:00
|
|
|
}
|
|
|
|
|
2014-02-08 23:36:58 +08:00
|
|
|
struct cgroup_subsys cpuset_cgrp_subsys = {
|
2014-07-09 16:48:01 +08:00
|
|
|
.css_alloc = cpuset_css_alloc,
|
|
|
|
.css_online = cpuset_css_online,
|
|
|
|
.css_offline = cpuset_css_offline,
|
|
|
|
.css_free = cpuset_css_free,
|
|
|
|
.can_attach = cpuset_can_attach,
|
|
|
|
.cancel_attach = cpuset_cancel_attach,
|
|
|
|
.attach = cpuset_attach,
|
2016-04-22 07:06:48 +08:00
|
|
|
.post_attach = cpuset_post_attach,
|
2014-07-09 16:48:01 +08:00
|
|
|
.bind = cpuset_bind,
|
2023-04-11 21:35:59 +08:00
|
|
|
.can_fork = cpuset_can_fork,
|
|
|
|
.cancel_fork = cpuset_cancel_fork,
|
2016-08-09 11:25:01 +08:00
|
|
|
.fork = cpuset_fork,
|
2018-11-08 23:08:35 +08:00
|
|
|
.legacy_cftypes = legacy_files,
|
|
|
|
.dfl_cftypes = dfl_files,
|
2016-02-23 23:00:50 +08:00
|
|
|
.early_init = true,
|
2018-11-08 23:08:35 +08:00
|
|
|
.threaded = true,
|
2007-10-19 14:39:39 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* cpuset_init - initialize cpusets at system boot
|
|
|
|
*
|
2019-05-14 00:33:22 +08:00
|
|
|
* Description: Initialize top_cpuset
|
2005-04-17 06:20:36 +08:00
|
|
|
**/
|
|
|
|
|
|
|
|
int __init cpuset_init(void)
|
|
|
|
{
|
2017-03-27 00:24:06 +08:00
|
|
|
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
|
|
|
|
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
|
2018-11-08 23:08:37 +08:00
|
|
|
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
|
2009-01-08 10:08:44 +08:00
|
|
|
cpumask_setall(top_cpuset.cpus_allowed);
|
2008-04-05 09:11:07 +08:00
|
|
|
nodes_setall(top_cpuset.mems_allowed);
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
cpumask_setall(top_cpuset.effective_cpus);
|
|
|
|
nodes_setall(top_cpuset.effective_mems);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
fmeter_init(&top_cpuset.fmeter);
|
2007-10-19 14:40:20 +08:00
|
|
|
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
|
2008-04-15 13:04:23 +08:00
|
|
|
top_cpuset.relax_domain_level = -1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-03-27 00:24:06 +08:00
|
|
|
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
|
2009-01-08 10:08:42 +08:00
|
|
|
|
2007-10-19 14:39:39 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-09-29 17:01:17 +08:00
|
|
|
/*
|
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.
This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.
Here are some more details:
rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().
In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).
This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.
I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.
The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.
It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.
Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-08-12 05:33:53 +08:00
|
|
|
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
|
2006-09-29 17:01:17 +08:00
|
|
|
* or memory nodes, we need to walk over the cpuset hierarchy,
|
|
|
|
* removing that CPU or node from all cpusets. If this removes the
|
2008-02-07 16:14:43 +08:00
|
|
|
* last CPU or node from a cpuset, then move the tasks in the empty
|
|
|
|
* cpuset to its next-highest non-empty parent.
|
2006-09-29 17:01:17 +08:00
|
|
|
*/
|
2008-02-07 16:14:43 +08:00
|
|
|
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
|
|
|
|
{
|
|
|
|
struct cpuset *parent;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find its next-highest non-empty parent, (top cpuset
|
|
|
|
* has online cpus, so can't be empty).
|
|
|
|
*/
|
2013-01-08 00:51:08 +08:00
|
|
|
parent = parent_cs(cs);
|
2009-01-08 10:08:44 +08:00
|
|
|
while (cpumask_empty(parent->cpus_allowed) ||
|
2008-02-07 16:14:47 +08:00
|
|
|
nodes_empty(parent->mems_allowed))
|
2013-01-08 00:51:08 +08:00
|
|
|
parent = parent_cs(parent);
|
2008-02-07 16:14:43 +08:00
|
|
|
|
2013-04-08 00:29:50 +08:00
|
|
|
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
|
2014-05-06 01:49:00 +08:00
|
|
|
pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
|
2014-02-12 22:29:50 +08:00
|
|
|
pr_cont_cgroup_name(cs->css.cgroup);
|
|
|
|
pr_cont("\n");
|
2013-04-08 00:29:50 +08:00
|
|
|
}
|
2008-02-07 16:14:43 +08:00
|
|
|
}
|
|
|
|
|
2014-07-09 16:49:04 +08:00
|
|
|
static void
|
|
|
|
hotplug_update_tasks_legacy(struct cpuset *cs,
|
|
|
|
struct cpumask *new_cpus, nodemask_t *new_mems,
|
|
|
|
bool cpus_updated, bool mems_updated)
|
2014-07-09 16:48:54 +08:00
|
|
|
{
|
|
|
|
bool is_empty;
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2014-07-09 16:49:04 +08:00
|
|
|
cpumask_copy(cs->cpus_allowed, new_cpus);
|
|
|
|
cpumask_copy(cs->effective_cpus, new_cpus);
|
|
|
|
cs->mems_allowed = *new_mems;
|
|
|
|
cs->effective_mems = *new_mems;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2014-07-09 16:48:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
|
2022-03-06 04:46:57 +08:00
|
|
|
* as the tasks will be migrated to an ancestor.
|
2014-07-09 16:48:54 +08:00
|
|
|
*/
|
2014-07-09 16:49:04 +08:00
|
|
|
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
|
2023-02-01 06:17:19 +08:00
|
|
|
update_tasks_cpumask(cs, new_cpus);
|
2014-07-09 16:49:04 +08:00
|
|
|
if (mems_updated && !nodes_empty(cs->mems_allowed))
|
2014-07-09 16:48:54 +08:00
|
|
|
update_tasks_nodemask(cs);
|
|
|
|
|
|
|
|
is_empty = cpumask_empty(cs->cpus_allowed) ||
|
|
|
|
nodes_empty(cs->mems_allowed);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move tasks to the nearest ancestor with execution resources,
|
|
|
|
* This is full cgroup operation which will also call back into
|
|
|
|
* cpuset. Should be done outside any lock.
|
|
|
|
*/
|
2023-07-04 20:03:52 +08:00
|
|
|
if (is_empty) {
|
|
|
|
mutex_unlock(&cpuset_mutex);
|
2014-07-09 16:48:54 +08:00
|
|
|
remove_tasks_in_empty_cpuset(cs);
|
2023-07-04 20:03:52 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
|
|
|
}
|
2014-07-09 16:48:54 +08:00
|
|
|
}
|
|
|
|
|
2014-07-09 16:49:04 +08:00
|
|
|
static void
|
|
|
|
hotplug_update_tasks(struct cpuset *cs,
|
|
|
|
struct cpumask *new_cpus, nodemask_t *new_mems,
|
|
|
|
bool cpus_updated, bool mems_updated)
|
2014-07-09 16:48:54 +08:00
|
|
|
{
|
2022-09-02 04:57:38 +08:00
|
|
|
/* A partition root is allowed to have empty effective cpus */
|
|
|
|
if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
|
2014-07-09 16:49:04 +08:00
|
|
|
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
|
|
|
|
if (nodes_empty(*new_mems))
|
|
|
|
*new_mems = parent_cs(cs)->effective_mems;
|
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2014-07-09 16:49:04 +08:00
|
|
|
cpumask_copy(cs->effective_cpus, new_cpus);
|
|
|
|
cs->effective_mems = *new_mems;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2014-07-09 16:48:54 +08:00
|
|
|
|
2014-07-09 16:49:04 +08:00
|
|
|
if (cpus_updated)
|
2023-02-01 06:17:19 +08:00
|
|
|
update_tasks_cpumask(cs, new_cpus);
|
2014-07-09 16:49:04 +08:00
|
|
|
if (mems_updated)
|
2014-07-09 16:48:54 +08:00
|
|
|
update_tasks_nodemask(cs);
|
|
|
|
}
|
|
|
|
|
2018-11-08 23:08:41 +08:00
|
|
|
static bool force_rebuild;
|
|
|
|
|
|
|
|
void cpuset_force_rebuild(void)
|
|
|
|
{
|
|
|
|
force_rebuild = true;
|
|
|
|
}
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/**
|
2013-06-09 17:14:47 +08:00
|
|
|
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
|
2013-01-08 00:51:07 +08:00
|
|
|
* @cs: cpuset in interest
|
2018-11-08 23:08:41 +08:00
|
|
|
* @tmp: the tmpmasks structure pointer
|
2008-02-07 16:14:43 +08:00
|
|
|
*
|
2013-01-08 00:51:07 +08:00
|
|
|
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
|
|
|
|
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
|
|
|
|
* all its tasks are moved to the nearest ancestor with both resources.
|
2012-05-24 22:16:41 +08:00
|
|
|
*/
|
2018-11-08 23:08:41 +08:00
|
|
|
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
|
2012-05-24 22:16:41 +08:00
|
|
|
{
|
2014-07-09 16:49:04 +08:00
|
|
|
static cpumask_t new_cpus;
|
|
|
|
static nodemask_t new_mems;
|
|
|
|
bool cpus_updated;
|
|
|
|
bool mems_updated;
|
2018-11-08 23:08:41 +08:00
|
|
|
struct cpuset *parent;
|
2013-06-09 17:14:22 +08:00
|
|
|
retry:
|
|
|
|
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
|
2012-05-24 22:16:41 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2012-05-24 22:16:55 +08:00
|
|
|
|
2013-06-09 17:14:22 +08:00
|
|
|
/*
|
|
|
|
* We have raced with task attaching. We wait until attaching
|
|
|
|
* is finished, so we won't attach a task to an empty cpuset.
|
|
|
|
*/
|
|
|
|
if (cs->attach_in_progress) {
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2013-06-09 17:14:22 +08:00
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2021-07-20 22:18:26 +08:00
|
|
|
parent = parent_cs(cs);
|
2018-11-08 23:08:41 +08:00
|
|
|
compute_effective_cpumask(&new_cpus, cs, parent);
|
|
|
|
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
|
|
|
|
|
|
|
|
if (cs->nr_subparts_cpus)
|
|
|
|
/*
|
|
|
|
* Make sure that CPUs allocated to child partitions
|
|
|
|
* do not show up in effective_cpus.
|
|
|
|
*/
|
|
|
|
cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
|
|
|
|
|
|
|
|
if (!tmp || !cs->partition_root_state)
|
|
|
|
goto update_tasks;
|
2012-05-24 22:16:41 +08:00
|
|
|
|
2018-11-08 23:08:41 +08:00
|
|
|
/*
|
|
|
|
* In the unlikely event that a partition root has empty
|
2022-09-02 04:57:39 +08:00
|
|
|
* effective_cpus with tasks, we will have to invalidate child
|
|
|
|
* partitions, if present, by setting nr_subparts_cpus to 0 to
|
|
|
|
* reclaim their cpus.
|
2018-11-08 23:08:41 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
|
|
|
|
cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
|
|
|
|
spin_lock_irq(&callback_lock);
|
|
|
|
cs->nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(cs->subparts_cpus);
|
|
|
|
spin_unlock_irq(&callback_lock);
|
|
|
|
compute_effective_cpumask(&new_cpus, cs, parent);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Force the partition to become invalid if either one of
|
|
|
|
* the following conditions hold:
|
|
|
|
* 1) empty effective cpus but not valid empty partition.
|
|
|
|
* 2) parent is invalid or doesn't grant any cpus to child
|
|
|
|
* partitions.
|
|
|
|
*/
|
|
|
|
if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
|
|
|
|
(cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
|
2022-09-02 04:57:41 +08:00
|
|
|
int old_prs, parent_prs;
|
2022-09-02 04:57:39 +08:00
|
|
|
|
|
|
|
update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
|
2018-11-08 23:08:41 +08:00
|
|
|
if (cs->nr_subparts_cpus) {
|
2021-07-20 22:18:28 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2018-11-08 23:08:41 +08:00
|
|
|
cs->nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(cs->subparts_cpus);
|
2021-07-20 22:18:28 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2018-11-08 23:08:41 +08:00
|
|
|
compute_effective_cpumask(&new_cpus, cs, parent);
|
|
|
|
}
|
2012-05-24 22:16:41 +08:00
|
|
|
|
2022-09-02 04:57:39 +08:00
|
|
|
old_prs = cs->partition_root_state;
|
2022-09-02 04:57:41 +08:00
|
|
|
parent_prs = parent->partition_root_state;
|
2022-09-02 04:57:39 +08:00
|
|
|
if (is_partition_valid(cs)) {
|
|
|
|
spin_lock_irq(&callback_lock);
|
|
|
|
make_partition_invalid(cs);
|
|
|
|
spin_unlock_irq(&callback_lock);
|
2022-09-02 04:57:41 +08:00
|
|
|
if (is_prs_invalid(parent_prs))
|
|
|
|
WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
|
|
|
|
else if (!parent_prs)
|
|
|
|
WRITE_ONCE(cs->prs_err, PERR_NOTPART);
|
|
|
|
else
|
|
|
|
WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
|
2022-09-02 04:57:39 +08:00
|
|
|
notify_partition_change(cs, old_prs);
|
2018-11-08 23:08:41 +08:00
|
|
|
}
|
|
|
|
cpuset_force_rebuild();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2022-09-02 04:57:37 +08:00
|
|
|
* On the other hand, an invalid partition root may be transitioned
|
2022-09-02 04:57:39 +08:00
|
|
|
* back to a regular one.
|
2018-11-08 23:08:41 +08:00
|
|
|
*/
|
2022-09-02 04:57:39 +08:00
|
|
|
else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
|
|
|
|
update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
|
|
|
|
if (is_partition_valid(cs))
|
|
|
|
cpuset_force_rebuild();
|
|
|
|
}
|
2018-11-08 23:08:41 +08:00
|
|
|
|
|
|
|
update_tasks:
|
2014-07-09 16:49:04 +08:00
|
|
|
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
|
|
|
|
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
|
2023-03-17 23:15:06 +08:00
|
|
|
if (!cpus_updated && !mems_updated)
|
|
|
|
goto unlock; /* Hotplug doesn't affect this cpuset */
|
2013-01-08 00:51:07 +08:00
|
|
|
|
mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu in docker while using cpuset
to bind it to movable nodes (a node only has movable zone, like a node
for hotplug or a Persistent Memory node in normal usage) will fail due
to memory allocation failure, and then OOM is involved and many other
innocent processes got killed.
It can be reproduced with command:
$ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"
(where node 4 is a movable node)
runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
Call Trace:
dump_stack+0x6b/0x88
dump_header+0x4a/0x1e2
oom_kill_process.cold+0xb/0x10
out_of_memory.part.0+0xaf/0x230
out_of_memory+0x3d/0x80
__alloc_pages_slowpath.constprop.0+0x954/0xa20
__alloc_pages_nodemask+0x2d3/0x300
pipe_write+0x322/0x590
new_sync_write+0x196/0x1b0
vfs_write+0x1c3/0x1f0
ksys_write+0xa7/0xe0
do_syscall_64+0x52/0xd0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Mem-Info:
active_anon:392832 inactive_anon:182 isolated_anon:0
active_file:68130 inactive_file:151527 isolated_file:0
unevictable:2701 dirty:0 writeback:7
slab_reclaimable:51418 slab_unreclaimable:116300
mapped:45825 shmem:735 pagetables:2540 bounce:0
free:159849484 free_pcp:73 free_cma:0
Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0 0
Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB
oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
The reason is that in this case, the target cpuset nodes only have
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory in non-movable zones (dma/dma32/normal) like
GFP_HIGHUSER, and the cpuset limit forbids the allocation, then
out-of-memory killing is involved even when normal nodes and movable
nodes both have many free memory.
The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope. The only reasonable
measure to take is to fail the allocation right away and have the caller
to deal with it.
So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.
As page allocation is one of the hottest path in kernel, this check will
hurt all users with sane cpuset configuration, add a static branch check
and detect the abnormal config in cpuset memory binding setup so that
the extra check cost in page allocation is not paid by everyone.
[thanks to Micho Hocko and David Rientjes for suggesting not handling
it inside OOM code, adding cpuset check, refining comments]
Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 04:40:34 +08:00
|
|
|
if (mems_updated)
|
|
|
|
check_insane_mems_config(&new_mems);
|
|
|
|
|
2017-08-18 03:33:10 +08:00
|
|
|
if (is_in_v2_mode())
|
2014-07-09 16:49:04 +08:00
|
|
|
hotplug_update_tasks(cs, &new_cpus, &new_mems,
|
|
|
|
cpus_updated, mems_updated);
|
2014-07-09 16:48:54 +08:00
|
|
|
else
|
2014-07-09 16:49:04 +08:00
|
|
|
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
|
|
|
|
cpus_updated, mems_updated);
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2023-03-17 23:15:06 +08:00
|
|
|
unlock:
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2006-09-29 17:01:17 +08:00
|
|
|
}
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/**
|
2020-04-03 23:32:13 +08:00
|
|
|
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
|
2023-08-02 11:04:12 +08:00
|
|
|
* @work: unused
|
2008-02-07 16:14:43 +08:00
|
|
|
*
|
2013-01-08 00:51:07 +08:00
|
|
|
* This function is called after either CPU or memory configuration has
|
|
|
|
* changed and updates cpuset accordingly. The top_cpuset is always
|
|
|
|
* synchronized to cpu_active_mask and N_MEMORY, which is necessary in
|
|
|
|
* order to make cpusets transparent (of no affect) on systems that are
|
|
|
|
* actively using CPU hotplug but making no active use of cpusets.
|
2008-02-07 16:14:43 +08:00
|
|
|
*
|
2013-01-08 00:51:07 +08:00
|
|
|
* Non-root cpusets are only affected by offlining. If any CPUs or memory
|
2013-06-09 17:14:47 +08:00
|
|
|
* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
|
|
|
|
* all descendants.
|
2008-02-07 16:14:43 +08:00
|
|
|
*
|
2013-01-08 00:51:07 +08:00
|
|
|
* Note that CPU offlining during suspend is ignored. We don't modify
|
|
|
|
* cpusets across suspend/resume cycles at all.
|
2008-02-07 16:14:43 +08:00
|
|
|
*/
|
2020-04-03 23:32:13 +08:00
|
|
|
static void cpuset_hotplug_workfn(struct work_struct *work)
|
2006-09-29 17:01:17 +08:00
|
|
|
{
|
2013-06-09 17:16:29 +08:00
|
|
|
static cpumask_t new_cpus;
|
|
|
|
static nodemask_t new_mems;
|
2013-01-08 00:51:07 +08:00
|
|
|
bool cpus_updated, mems_updated;
|
2017-08-18 03:33:10 +08:00
|
|
|
bool on_dfl = is_in_v2_mode();
|
2018-11-08 23:08:41 +08:00
|
|
|
struct tmpmasks tmp, *ptmp = NULL;
|
|
|
|
|
|
|
|
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
|
|
|
|
ptmp = &tmp;
|
2006-09-29 17:01:17 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_lock(&cpuset_mutex);
|
2008-02-07 16:14:43 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/* fetch the available cpus/mems and find out which changed how */
|
|
|
|
cpumask_copy(&new_cpus, cpu_active_mask);
|
|
|
|
new_mems = node_states[N_MEMORY];
|
2012-05-24 22:16:55 +08:00
|
|
|
|
2018-11-08 23:08:41 +08:00
|
|
|
/*
|
|
|
|
* If subparts_cpus is populated, it is likely that the check below
|
|
|
|
* will produce a false positive on cpus_updated when the cpu list
|
|
|
|
* isn't changed. It is extra work, but it is better to be safe.
|
|
|
|
*/
|
2014-07-09 16:48:42 +08:00
|
|
|
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
|
|
|
|
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
|
2012-05-24 22:16:55 +08:00
|
|
|
|
2021-07-20 22:18:27 +08:00
|
|
|
/*
|
|
|
|
* In the rare case that hotplug removes all the cpus in subparts_cpus,
|
|
|
|
* we assumed that cpus are updated.
|
|
|
|
*/
|
|
|
|
if (!cpus_updated && top_cpuset.nr_subparts_cpus)
|
|
|
|
cpus_updated = true;
|
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/* synchronize cpus_allowed to cpu_active_mask */
|
|
|
|
if (cpus_updated) {
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2014-07-09 16:48:42 +08:00
|
|
|
if (!on_dfl)
|
|
|
|
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
|
2018-11-08 23:08:41 +08:00
|
|
|
/*
|
|
|
|
* Make sure that CPUs allocated to child partitions
|
|
|
|
* do not show up in effective_cpus. If no CPU is left,
|
|
|
|
* we clear the subparts_cpus & let the child partitions
|
|
|
|
* fight for the CPUs again.
|
|
|
|
*/
|
|
|
|
if (top_cpuset.nr_subparts_cpus) {
|
|
|
|
if (cpumask_subset(&new_cpus,
|
|
|
|
top_cpuset.subparts_cpus)) {
|
|
|
|
top_cpuset.nr_subparts_cpus = 0;
|
|
|
|
cpumask_clear(top_cpuset.subparts_cpus);
|
|
|
|
} else {
|
|
|
|
cpumask_andnot(&new_cpus, &new_cpus,
|
|
|
|
top_cpuset.subparts_cpus);
|
|
|
|
}
|
|
|
|
}
|
cpuset: update cpuset->effective_{cpus,mems} at hotplug
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The first item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:16 +08:00
|
|
|
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2013-01-08 00:51:07 +08:00
|
|
|
/* we don't mess with cpumasks of tasks in top_cpuset */
|
|
|
|
}
|
2008-02-07 16:14:47 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/* synchronize mems_allowed to N_MEMORY */
|
|
|
|
if (mems_updated) {
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_lock_irq(&callback_lock);
|
2014-07-09 16:48:42 +08:00
|
|
|
if (!on_dfl)
|
|
|
|
top_cpuset.mems_allowed = new_mems;
|
cpuset: update cpuset->effective_{cpus,mems} at hotplug
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
To make cs->effective_{cpus,mems} to be effective masks, we need to
- update the effective masks at hotplug
- update the effective masks at config change
- take on ancestor's mask when the effective mask is empty
The first item is done here.
This won't introduce behavior change.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:16 +08:00
|
|
|
top_cpuset.effective_mems = new_mems;
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irq(&callback_lock);
|
2014-02-13 19:58:40 +08:00
|
|
|
update_tasks_nodemask(&top_cpuset);
|
2013-01-08 00:51:07 +08:00
|
|
|
}
|
2008-02-07 16:14:47 +08:00
|
|
|
|
2023-05-08 15:58:50 +08:00
|
|
|
mutex_unlock(&cpuset_mutex);
|
2013-06-09 17:14:47 +08:00
|
|
|
|
2013-06-09 17:16:29 +08:00
|
|
|
/* if cpus or mems changed, we need to propagate to descendants */
|
|
|
|
if (cpus_updated || mems_updated) {
|
2013-01-08 00:51:07 +08:00
|
|
|
struct cpuset *cs;
|
2013-08-09 08:11:25 +08:00
|
|
|
struct cgroup_subsys_state *pos_css;
|
2008-07-25 16:47:22 +08:00
|
|
|
|
2013-01-08 00:51:08 +08:00
|
|
|
rcu_read_lock();
|
2013-08-09 08:11:25 +08:00
|
|
|
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
2014-05-14 00:11:01 +08:00
|
|
|
if (cs == &top_cpuset || !css_tryget_online(&cs->css))
|
2013-06-09 17:14:47 +08:00
|
|
|
continue;
|
|
|
|
rcu_read_unlock();
|
2012-05-24 22:16:55 +08:00
|
|
|
|
2018-11-08 23:08:41 +08:00
|
|
|
cpuset_hotplug_update_tasks(cs, ptmp);
|
2008-02-07 16:14:47 +08:00
|
|
|
|
2013-06-09 17:14:47 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
css_put(&cs->css);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2013-01-08 00:51:07 +08:00
|
|
|
|
2013-01-08 00:51:07 +08:00
|
|
|
/* rebuild sched domains if cpus_allowed has changed */
|
2017-09-07 17:13:38 +08:00
|
|
|
if (cpus_updated || force_rebuild) {
|
|
|
|
force_rebuild = false;
|
2020-04-03 23:32:13 +08:00
|
|
|
rebuild_sched_domains();
|
2017-09-07 17:13:38 +08:00
|
|
|
}
|
2018-11-08 23:08:41 +08:00
|
|
|
|
|
|
|
free_cpumasks(NULL, ptmp);
|
2006-09-29 17:01:17 +08:00
|
|
|
}
|
|
|
|
|
2020-04-03 23:32:13 +08:00
|
|
|
void cpuset_update_active_cpus(void)
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to cpu_online_map
Change the list of cpus allowed to tasks in the top (root) cpuset to
dynamically track what cpus are online, using a CPU hotplug notifier. Make
this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support CPU hotplug, then these tasks cannot make use
of CPUs that are added after system boot, because the CPUs are not allowed
in the top cpuset. This is a surprising regression over earlier kernels
that didn't have cpusets enabled.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'cpus' file in the top (root) cpuset, making it read
only, and making it automatically track the value of cpu_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged CPUs allowed
by their cpuset.
Thanks to Anton Blanchard and Nathan Lynch for reporting this problem,
driving the fix, and earlier versions of this patch.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-08-27 16:23:51 +08:00
|
|
|
{
|
2020-04-03 23:32:13 +08:00
|
|
|
/*
|
|
|
|
* We're inside cpu hotplug critical region which usually nests
|
|
|
|
* inside cgroup synchronization. Bounce actual hotplug processing
|
|
|
|
* to a work item to avoid reverse locking order.
|
|
|
|
*/
|
|
|
|
schedule_work(&cpuset_hotplug_work);
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to cpu_online_map
Change the list of cpus allowed to tasks in the top (root) cpuset to
dynamically track what cpus are online, using a CPU hotplug notifier. Make
this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support CPU hotplug, then these tasks cannot make use
of CPUs that are added after system boot, because the CPUs are not allowed
in the top cpuset. This is a surprising regression over earlier kernels
that didn't have cpusets enabled.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'cpus' file in the top (root) cpuset, making it read
only, and making it automatically track the value of cpu_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged CPUs allowed
by their cpuset.
Thanks to Anton Blanchard and Nathan Lynch for reporting this problem,
driving the fix, and earlier versions of this patch.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-08-27 16:23:51 +08:00
|
|
|
}
|
|
|
|
|
2020-04-03 23:32:13 +08:00
|
|
|
void cpuset_wait_for_hotplug(void)
|
2017-09-07 17:13:38 +08:00
|
|
|
{
|
2020-04-03 23:32:13 +08:00
|
|
|
flush_work(&cpuset_hotplug_work);
|
2017-09-07 17:13:38 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map
Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code. Make this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset. This is a surprising regression
over earlier kernels that didn't have cpusets enabled.
One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.
This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets. The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no effect, even though the change
(the write system call) appeared to succeed. With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes. Thus no one should be misled
into thinking they've changed the top_cpuset's 'mems' when in effect they
haven't.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.
[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 17:01:16 +08:00
|
|
|
/*
|
2012-12-13 05:51:24 +08:00
|
|
|
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
|
|
|
|
* Call this routine anytime after node_states[N_MEMORY] changes.
|
2012-05-24 22:17:03 +08:00
|
|
|
* See cpuset_update_active_cpus() for CPU hotplug handling.
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map
Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code. Make this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset. This is a surprising regression
over earlier kernels that didn't have cpusets enabled.
One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.
This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets. The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed. With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes. Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.
[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 17:01:16 +08:00
|
|
|
*/
|
2008-11-20 07:36:30 +08:00
|
|
|
static int cpuset_track_online_nodes(struct notifier_block *self,
|
|
|
|
unsigned long action, void *arg)
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map
Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code. Make this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset. This is a surprising regression
over earlier kernels that didn't have cpusets enabled.
One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.
This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets. The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed. With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes. Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.
[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 17:01:16 +08:00
|
|
|
{
|
2013-01-08 00:51:07 +08:00
|
|
|
schedule_work(&cpuset_hotplug_work);
|
2008-11-20 07:36:30 +08:00
|
|
|
return NOTIFY_OK;
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map
Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code. Make this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset. This is a surprising regression
over earlier kernels that didn't have cpusets enabled.
One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.
This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets. The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed. With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes. Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.
[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 17:01:16 +08:00
|
|
|
}
|
2013-04-30 06:08:08 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* cpuset_init_smp - initialize cpus_allowed
|
|
|
|
*
|
|
|
|
* Description: Finish top cpuset after cpu, node maps are initialized
|
2013-04-30 06:08:08 +08:00
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
void __init cpuset_init_smp(void)
|
|
|
|
{
|
2022-04-27 22:54:28 +08:00
|
|
|
/*
|
|
|
|
* cpus_allowd/mems_allowed set to v2 values in the initial
|
|
|
|
* cpuset_bind() call will be reset to v1 values in another
|
|
|
|
* cpuset_bind() call when v1 cpuset is mounted.
|
|
|
|
*/
|
2013-06-09 17:15:08 +08:00
|
|
|
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
|
[PATCH] cpuset: top_cpuset tracks hotplug changes to cpu_online_map
Change the list of cpus allowed to tasks in the top (root) cpuset to
dynamically track what cpus are online, using a CPU hotplug notifier. Make
this top cpus file read-only.
On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.
If that system does support CPU hotplug, then these tasks cannot make use
of CPUs that are added after system boot, because the CPUs are not allowed
in the top cpuset. This is a surprising regression over earlier kernels
that didn't have cpusets enabled.
In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'cpus' file in the top (root) cpuset, making it read
only, and making it automatically track the value of cpu_online_map. Thus
tasks in the top cpuset will have automatic use of hot plugged CPUs allowed
by their cpuset.
Thanks to Anton Blanchard and Nathan Lynch for reporting this problem,
driving the fix, and earlier versions of this patch.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-08-27 16:23:51 +08:00
|
|
|
|
cpuset: add cs->effective_cpus and cs->effective_mems
We're going to have separate user-configured masks and effective ones.
Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierachical restriction,
and these are the real masks that apply to the tasks in the cpuset.
We calculate effective mask this way:
- top cpuset's effective_mask == online_mask, otherwise
- cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.
Those behavior changes are for default hierarchy only. For legacy
hierachy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset is the same with configured
masks, and a child cpuset inherits its parent's effective masks.
This won't introduce behavior change.
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-07-09 16:47:03 +08:00
|
|
|
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
|
|
|
|
top_cpuset.effective_mems = node_states[N_MEMORY];
|
|
|
|
|
2022-09-23 11:33:47 +08:00
|
|
|
hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
|
2016-01-20 01:18:41 +08:00
|
|
|
|
|
|
|
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
|
|
|
|
BUG_ON(!cpuset_migrate_mm_wq);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
|
|
|
|
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
|
2009-01-08 10:08:45 +08:00
|
|
|
* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2009-01-08 10:08:44 +08:00
|
|
|
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
|
2005-04-17 06:20:36 +08:00
|
|
|
* attached to the specified @tsk. Guaranteed to return some non-empty
|
2012-03-29 13:08:31 +08:00
|
|
|
* subset of cpu_online_mask, even if this means going outside the
|
2023-02-06 11:48:53 +08:00
|
|
|
* tasks cpuset, except when the task is in the top cpuset.
|
2005-04-17 06:20:36 +08:00
|
|
|
**/
|
|
|
|
|
2009-01-08 10:08:45 +08:00
|
|
|
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2014-10-20 19:50:29 +08:00
|
|
|
unsigned long flags;
|
2023-02-06 11:48:53 +08:00
|
|
|
struct cpuset *cs;
|
2014-10-20 19:50:29 +08:00
|
|
|
|
|
|
|
spin_lock_irqsave(&callback_lock, flags);
|
2023-02-06 11:48:53 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
cs = task_cs(tsk);
|
|
|
|
if (cs != &top_cpuset)
|
|
|
|
guarantee_online_cpus(tsk, pmask);
|
|
|
|
/*
|
|
|
|
* Tasks in the top cpuset won't get update to their cpumasks
|
|
|
|
* when a hotplug online/offline event happens. So we include all
|
|
|
|
* offline cpus in the allowed cpu list.
|
|
|
|
*/
|
|
|
|
if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
|
|
|
|
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We first exclude cpus allocated to partitions. If there is no
|
|
|
|
* allowable online cpu left, we fall back to all possible cpus.
|
|
|
|
*/
|
|
|
|
cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
|
|
|
|
if (!cpumask_intersects(pmask, cpu_online_mask))
|
|
|
|
cpumask_copy(pmask, possible_mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
rcu_read_unlock();
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irqrestore(&callback_lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
cpuset: restore sanity to cpuset_cpus_allowed_fallback()
In the case that a process is constrained by taskset(1) (i.e.
sched_setaffinity(2)) to a subset of available cpus, and all of those are
subsequently offlined, the scheduler will set tsk->cpus_allowed to
the current value of task_cs(tsk)->effective_cpus.
This is done via a call to do_set_cpus_allowed() in the context of
cpuset_cpus_allowed_fallback() made by the scheduler when this case is
detected. This is the only call made to cpuset_cpus_allowed_fallback()
in the latest mainline kernel.
However, this is not sane behavior.
I will demonstrate this on a system running the latest upstream kernel
with the following initial configuration:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,fffffff
Cpus_allowed_list: 0-63
(Where cpus 32-63 are provided via smt.)
If we limit our current shell process to cpu2 only and then offline it
and reonline it:
# taskset -p 4 $$
pid 2272's current affinity mask: ffffffffffffffff
pid 2272's new affinity mask: 4
# echo off > /sys/devices/system/cpu/cpu2/online
# dmesg | tail -3
[ 2195.866089] process 2272 (bash) no longer affine to cpu2
[ 2195.872700] IRQ 114: no longer affine to CPU2
[ 2195.879128] smpboot: CPU 2 is now offline
# echo on > /sys/devices/system/cpu/cpu2/online
# dmesg | tail -1
[ 2617.043572] smpboot: Booting Node 0 Processor 2 APIC 0x4
We see that our current process now has an affinity mask containing
every cpu available on the system _except_ the one we originally
constrained it to:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,fffffffb
Cpus_allowed_list: 0-1,3-63
This is not sane behavior, as the scheduler can now not only place the
process on previously forbidden cpus, it can't even schedule it on
the cpu it was originally constrained to!
Other cases result in even more exotic affinity masks. Take for instance
a process with an affinity mask containing only cpus provided by smt at
the moment that smt is toggled, in a configuration such as the following:
# taskset -p f000000000 $$
# grep -i cpu /proc/$$/status
Cpus_allowed: 000000f0,00000000
Cpus_allowed_list: 36-39
A double toggle of smt results in the following behavior:
# echo off > /sys/devices/system/cpu/smt/control
# echo on > /sys/devices/system/cpu/smt/control
# grep -i cpus /proc/$$/status
Cpus_allowed: ffffff00,ffffffff
Cpus_allowed_list: 0-31,40-63
This is even less sane than the previous case, as the new affinity mask
excludes all smt-provided cpus with ids less than those that were
previously in the affinity mask, as well as those that were actually in
the mask.
With this patch applied, both of these cases end in the following state:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,ffffffff
Cpus_allowed_list: 0-63
The original policy is discarded. Though not ideal, it is the simplest way
to restore sanity to this fallback case without reinventing the cpuset
wheel that rolls down the kernel just fine in cgroup v2. A user who wishes
for the previous affinity mask to be restored in this fallback case can use
that mechanism instead.
This patch modifies scheduler behavior by instead resetting the mask to
task_cs(tsk)->cpus_allowed by default, and cpu_possible mask in legacy
mode. I tested the cases above on both modes.
Note that the scheduler uses this fallback mechanism if and only if
_every_ other valid avenue has been traveled, and it is the last resort
before calling BUG().
Suggested-by: Waiman Long <longman@redhat.com>
Suggested-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Joel Savitz <jsavitz@redhat.com>
Acked-by: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2019-06-12 23:50:48 +08:00
|
|
|
/**
|
|
|
|
* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
|
|
|
|
* @tsk: pointer to task_struct with which the scheduler is struggling
|
|
|
|
*
|
|
|
|
* Description: In the case that the scheduler cannot find an allowed cpu in
|
|
|
|
* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
|
|
|
|
* mode however, this value is the same as task_cs(tsk)->effective_cpus,
|
|
|
|
* which will not contain a sane cpumask during cases such as cpu hotplugging.
|
|
|
|
* This is the absolute last resort for the scheduler and it is only used if
|
|
|
|
* _every_ other avenue has been traveled.
|
2021-07-30 19:24:31 +08:00
|
|
|
*
|
|
|
|
* Returns true if the affinity of @tsk was changed, false otherwise.
|
cpuset: restore sanity to cpuset_cpus_allowed_fallback()
In the case that a process is constrained by taskset(1) (i.e.
sched_setaffinity(2)) to a subset of available cpus, and all of those are
subsequently offlined, the scheduler will set tsk->cpus_allowed to
the current value of task_cs(tsk)->effective_cpus.
This is done via a call to do_set_cpus_allowed() in the context of
cpuset_cpus_allowed_fallback() made by the scheduler when this case is
detected. This is the only call made to cpuset_cpus_allowed_fallback()
in the latest mainline kernel.
However, this is not sane behavior.
I will demonstrate this on a system running the latest upstream kernel
with the following initial configuration:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,fffffff
Cpus_allowed_list: 0-63
(Where cpus 32-63 are provided via smt.)
If we limit our current shell process to cpu2 only and then offline it
and reonline it:
# taskset -p 4 $$
pid 2272's current affinity mask: ffffffffffffffff
pid 2272's new affinity mask: 4
# echo off > /sys/devices/system/cpu/cpu2/online
# dmesg | tail -3
[ 2195.866089] process 2272 (bash) no longer affine to cpu2
[ 2195.872700] IRQ 114: no longer affine to CPU2
[ 2195.879128] smpboot: CPU 2 is now offline
# echo on > /sys/devices/system/cpu/cpu2/online
# dmesg | tail -1
[ 2617.043572] smpboot: Booting Node 0 Processor 2 APIC 0x4
We see that our current process now has an affinity mask containing
every cpu available on the system _except_ the one we originally
constrained it to:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,fffffffb
Cpus_allowed_list: 0-1,3-63
This is not sane behavior, as the scheduler can now not only place the
process on previously forbidden cpus, it can't even schedule it on
the cpu it was originally constrained to!
Other cases result in even more exotic affinity masks. Take for instance
a process with an affinity mask containing only cpus provided by smt at
the moment that smt is toggled, in a configuration such as the following:
# taskset -p f000000000 $$
# grep -i cpu /proc/$$/status
Cpus_allowed: 000000f0,00000000
Cpus_allowed_list: 36-39
A double toggle of smt results in the following behavior:
# echo off > /sys/devices/system/cpu/smt/control
# echo on > /sys/devices/system/cpu/smt/control
# grep -i cpus /proc/$$/status
Cpus_allowed: ffffff00,ffffffff
Cpus_allowed_list: 0-31,40-63
This is even less sane than the previous case, as the new affinity mask
excludes all smt-provided cpus with ids less than those that were
previously in the affinity mask, as well as those that were actually in
the mask.
With this patch applied, both of these cases end in the following state:
# grep -i cpu /proc/$$/status
Cpus_allowed: ffffffff,ffffffff
Cpus_allowed_list: 0-63
The original policy is discarded. Though not ideal, it is the simplest way
to restore sanity to this fallback case without reinventing the cpuset
wheel that rolls down the kernel just fine in cgroup v2. A user who wishes
for the previous affinity mask to be restored in this fallback case can use
that mechanism instead.
This patch modifies scheduler behavior by instead resetting the mask to
task_cs(tsk)->cpus_allowed by default, and cpu_possible mask in legacy
mode. I tested the cases above on both modes.
Note that the scheduler uses this fallback mechanism if and only if
_every_ other valid avenue has been traveled, and it is the last resort
before calling BUG().
Suggested-by: Waiman Long <longman@redhat.com>
Suggested-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Joel Savitz <jsavitz@redhat.com>
Acked-by: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2019-06-12 23:50:48 +08:00
|
|
|
**/
|
|
|
|
|
2021-07-30 19:24:31 +08:00
|
|
|
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
|
2010-03-15 17:10:27 +08:00
|
|
|
{
|
2021-07-30 19:24:29 +08:00
|
|
|
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
|
|
|
|
const struct cpumask *cs_mask;
|
2021-07-30 19:24:31 +08:00
|
|
|
bool changed = false;
|
2021-07-30 19:24:29 +08:00
|
|
|
|
2010-03-15 17:10:27 +08:00
|
|
|
rcu_read_lock();
|
2021-07-30 19:24:29 +08:00
|
|
|
cs_mask = task_cs(tsk)->cpus_allowed;
|
2021-07-30 19:24:31 +08:00
|
|
|
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
|
2021-07-30 19:24:29 +08:00
|
|
|
do_set_cpus_allowed(tsk, cs_mask);
|
2021-07-30 19:24:31 +08:00
|
|
|
changed = true;
|
|
|
|
}
|
2010-03-15 17:10:27 +08:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We own tsk->cpus_allowed, nobody can change it under us.
|
|
|
|
*
|
|
|
|
* But we used cs && cs->cpus_allowed lockless and thus can
|
|
|
|
* race with cgroup_attach_task() or update_cpumask() and get
|
|
|
|
* the wrong tsk->cpus_allowed. However, both cases imply the
|
|
|
|
* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
|
|
|
|
* which takes task_rq_lock().
|
|
|
|
*
|
|
|
|
* If we are called after it dropped the lock we must see all
|
|
|
|
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
|
|
|
|
* set any mask even if it is not right from task_cs() pov,
|
|
|
|
* the pending set_cpus_allowed_ptr() will fix things.
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is ran before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to scheduling a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 22:57:01 +08:00
|
|
|
*
|
|
|
|
* select_fallback_rq() will fix things ups and set cpu_possible_mask
|
|
|
|
* if required.
|
2010-03-15 17:10:27 +08:00
|
|
|
*/
|
2021-07-30 19:24:31 +08:00
|
|
|
return changed;
|
2010-03-15 17:10:27 +08:00
|
|
|
}
|
|
|
|
|
2015-02-13 07:00:16 +08:00
|
|
|
/*
 * cpuset_init_current_mems_allowed - open up the calling task's node mask.
 *
 * Passes current->mems_allowed to nodes_setall(), i.e. marks every node
 * as allowed for the current task.  Takes no arguments and returns
 * nothing; the only state touched is current->mems_allowed.  The __init
 * annotation marks this as initialization-time code.
 */
void __init cpuset_init_current_mems_allowed(void)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-04-05 09:11:07 +08:00
|
|
|
nodes_setall(current->mems_allowed);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-01-08 17:01:55 +08:00
|
|
|
/**
|
|
|
|
* cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
|
|
|
|
* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
|
|
|
|
*
|
|
|
|
* Description: Returns the nodemask_t mems_allowed of the cpuset
|
|
|
|
* attached to the specified @tsk. Guaranteed to return some non-empty
|
2012-12-13 05:51:24 +08:00
|
|
|
* subset of node_states[N_MEMORY], even if this means going outside the
|
2006-01-08 17:01:55 +08:00
|
|
|
* tasks cpuset.
|
|
|
|
**/
|
|
|
|
|
|
|
|
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
/* Filled in by guarantee_online_mems() below and returned by value. */
nodemask_t mask;
|
2014-10-20 19:50:29 +08:00
|
|
|
/* Saved interrupt state for the irqsave/irqrestore pair. */
unsigned long flags;
|
2006-01-08 17:01:55 +08:00
|
|
|
|
2014-10-20 19:50:29 +08:00
|
|
|
/*
 * Lock order: callback_lock (with interrupt state saved) on the
 * outside, RCU read-side section on the inside; released in the
 * reverse order below.
 */
spin_lock_irqsave(&callback_lock, flags);
|
2014-03-04 06:28:36 +08:00
|
|
|
/*
 * NOTE(review): presumably the RCU read section is what keeps the
 * cpuset returned by task_cs(tsk) valid while it is used -- confirm
 * against task_cs().
 */
rcu_read_lock();
|
2014-07-09 16:48:32 +08:00
|
|
|
/* Compute the result for the cpuset of @tsk directly into mask. */
guarantee_online_mems(task_cs(tsk), &mask);
|
2014-03-04 06:28:36 +08:00
|
|
|
rcu_read_unlock();
|
2014-10-20 19:50:29 +08:00
|
|
|
spin_unlock_irqrestore(&callback_lock, flags);
|
2006-01-08 17:01:55 +08:00
|
|
|
|
|
|
|
/* The caller gets a private copy; no lock is held on return. */
return mask;
|
|
|
|
}
|
|
|
|
|
2005-07-28 02:45:11 +08:00
|
|
|
/**
|
2021-05-24 16:29:43 +08:00
|
|
|
* cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
|
2008-04-28 17:12:18 +08:00
|
|
|
* @nodemask: the nodemask to be checked
|
2005-07-28 02:45:11 +08:00
|
|
|
*
|
2008-04-28 17:12:18 +08:00
|
|
|
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-04-28 17:12:18 +08:00
|
|
|
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-04-28 17:12:18 +08:00
|
|
|
/*
 * The result is whatever nodes_intersects() reports for *nodemask
 * against current->mems_allowed, i.e. whether the two masks have at
 * least one node in common.  @nodemask is dereferenced
 * unconditionally, so it must not be NULL.
 */
return nodes_intersects(*nodemask, current->mems_allowed);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
/*
|
2008-04-29 16:00:26 +08:00
|
|
|
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
|
|
|
|
* mem_hardwall ancestor to the specified cpuset. Call holding
|
2014-10-20 19:50:29 +08:00
|
|
|
* callback_lock. If no ancestor is mem_exclusive or mem_hardwall
|
2008-04-29 16:00:26 +08:00
|
|
|
* (an unusual configuration), then returns the root cpuset.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
/*
 * Implementation note: starting at @cs, step to parent_cs(cs) for as
 * long as the cpuset under inspection has neither is_mem_exclusive()
 * nor is_mem_hardwall() set and parent_cs() still yields a parent.
 * The cpuset at which the walk stops is returned.  Consequently @cs
 * itself is returned unchanged when it already carries one of the two
 * flags, and a cpuset with no parent is returned when nothing on the
 * way up carries either flag.  @cs is dereferenced by the flag helpers
 * without a NULL check.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
{
|
2013-01-08 00:51:08 +08:00
|
|
|
while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
|
|
|
|
cs = parent_cs(cs);
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
return cs;
|
|
|
|
}
|
|
|
|
|
2022-02-16 11:17:53 +08:00
|
|
|
/*
|
2023-02-28 16:35:37 +08:00
|
|
|
* cpuset_node_allowed - Can we allocate on a memory node?
|
2009-04-03 07:57:54 +08:00
|
|
|
* @node: is this an allowed node?
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occasion.
The hardwall version requires that the current tasks mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclosing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 16:34:25 +08:00
|
|
|
* @gfp_mask: memory allocation flags
|
2005-07-28 02:45:11 +08:00
|
|
|
*
|
2015-04-15 06:47:01 +08:00
|
|
|
* If we're in interrupt, yes, we can always allocate. If @node is set in
|
|
|
|
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
|
|
|
|
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
|
2017-09-07 07:24:53 +08:00
|
|
|
* yes. If current has access to memory reserves as an oom victim, yes.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
* Otherwise, no.
|
|
|
|
*
|
|
|
|
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
|
2007-05-07 05:49:32 +08:00
|
|
|
* and do not allow allocations outside the current tasks cpuset
|
2017-09-07 07:24:53 +08:00
|
|
|
* unless the task has been OOM killed.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
* GFP_KERNEL allocations are not so marked, so can escape to the
|
2008-04-29 16:00:26 +08:00
|
|
|
* nearest enclosing hardwalled ancestor cpuset.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
*
|
2014-10-20 19:50:29 +08:00
|
|
|
* Scanning up parent cpusets requires callback_lock. The
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occasion.
The hardwall version requires that the current tasks mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclosing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 16:34:25 +08:00
|
|
|
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
|
|
|
|
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
|
|
|
|
* current tasks mems_allowed came up empty on the first pass over
|
|
|
|
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
|
2014-10-20 19:50:29 +08:00
|
|
|
* cpuset are short of memory, might require taking the callback_lock.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
*
|
2006-05-21 06:00:10 +08:00
|
|
|
* The first call here from mm/page_alloc:get_page_from_freelist()
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occasion.
The hardwall version requires that the current tasks mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclosing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 16:34:25 +08:00
|
|
|
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
|
|
|
|
* so no allocation on a node outside the cpuset is allowed (unless
|
|
|
|
* in interrupt, of course).
|
2006-05-21 06:00:10 +08:00
|
|
|
*
|
|
|
|
* The second pass through get_page_from_freelist() doesn't even call
|
|
|
|
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
|
|
|
|
* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
|
|
|
|
* in alloc_flags. That logic and the checks below have the combined
|
|
|
|
* effect that:
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
* in_interrupt - any node ok (current task context irrelevant)
|
|
|
|
* GFP_ATOMIC - any node ok
|
2017-09-07 07:24:53 +08:00
|
|
|
* tsk_is_oom_victim - any node ok
|
2008-04-29 16:00:26 +08:00
|
|
|
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
* GFP_USER - only nodes in current tasks mems allowed ok.
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occasion.
The hardwall version requires that the current tasks mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclosing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 16:34:25 +08:00
|
|
|
*/
|
2023-02-28 16:35:37 +08:00
|
|
|
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *hardwall_cs;	/* result of nearest_hardwall_ancestor() */
	unsigned long irq_flags;	/* saved irq state for callback_lock */
	bool node_ok;			/* is allocation on @node allowed? */

	/*
	 * Two unconditional "yes" cases, checked in this order:
	 *  - interrupt context: any node is ok, the current task is
	 *    irrelevant there;
	 *  - the node is already in the current task's mems_allowed.
	 */
	if (in_interrupt() || node_isset(node, current->mems_allowed))
		return true;

	/*
	 * A task that has been OOM killed has access to memory reserves
	 * and may take memory from any node.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;

	/* Hardwall request: the node is outside mems_allowed, so refuse. */
	if (gfp_mask & __GFP_HARDWALL)
		return false;

	/* Let a dying task have memory. */
	if (current->flags & PF_EXITING)
		return true;

	/*
	 * Not hardwall and node outside mems_allowed: scan up the cpuset
	 * hierarchy and test @node against the mems_allowed of the cpuset
	 * returned by nearest_hardwall_ancestor().  callback_lock is taken
	 * first (irq-safe), with the RCU read-side section nested inside
	 * it around the task_cs() lookup and the node test.
	 */
	spin_lock_irqsave(&callback_lock, irq_flags);

	rcu_read_lock();
	hardwall_cs = nearest_hardwall_ancestor(task_cs(current));
	node_ok = node_isset(node, hardwall_cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, irq_flags);

	return node_ok;
}
|
|
|
|
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
/**
|
2023-01-08 10:12:17 +08:00
|
|
|
* cpuset_spread_node() - On which node to begin search for a page
|
2023-08-02 11:04:12 +08:00
|
|
|
* @rotor: round robin rotor
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
*
|
|
|
|
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
|
|
|
|
* tasks in a cpuset with is_spread_page or is_spread_slab set),
|
|
|
|
* and if the memory allocation used cpuset_mem_spread_node()
|
|
|
|
* to determine on which node to start looking, as it will for
|
|
|
|
* certain page cache or slab cache pages such as used for file
|
|
|
|
* system buffers and inode caches, then instead of starting on the
|
|
|
|
* local node to look for a free page, rather spread the starting
|
|
|
|
* node around the tasks mems_allowed nodes.
|
|
|
|
*
|
|
|
|
* We don't have to worry about the returned node being offline
|
|
|
|
* because "it can't happen", and even if it did, it would be ok.
|
|
|
|
*
|
|
|
|
* The routines calling guarantee_online_mems() are careful to
|
|
|
|
* only set nodes in task->mems_allowed that are online. So it
|
|
|
|
* should not be possible for the following code to return an
|
|
|
|
* offline node. But if it did, that would be ok, as this routine
|
|
|
|
* is not returning the node where the allocation must be, only
|
|
|
|
* the node where the search should start. The zonelist passed to
|
|
|
|
* __alloc_pages() will include all nodes. If the slab allocator
|
|
|
|
* is passed an offline node, it will fall back to the local node.
|
|
|
|
* See kmem_cache_alloc_node().
|
|
|
|
*/
|
2010-05-27 05:42:49 +08:00
|
|
|
static int cpuset_spread_node(int *rotor)
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
{
|
2016-05-20 08:10:58 +08:00
|
|
|
return *rotor = next_node_in(*rotor, current->mems_allowed);
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
}
|
2010-05-27 05:42:49 +08:00
|
|
|
|
2023-01-08 10:12:17 +08:00
|
|
|
/**
|
|
|
|
* cpuset_mem_spread_node() - On which node to begin search for a file page
|
|
|
|
*/
|
2010-05-27 05:42:49 +08:00
|
|
|
int cpuset_mem_spread_node(void)
|
|
|
|
{
|
2011-07-27 07:08:30 +08:00
|
|
|
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
|
|
|
|
current->cpuset_mem_spread_rotor =
|
|
|
|
node_random(&current->mems_allowed);
|
|
|
|
|
2010-05-27 05:42:49 +08:00
|
|
|
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
|
|
|
|
}
|
|
|
|
|
2023-01-08 10:12:17 +08:00
|
|
|
/**
|
|
|
|
* cpuset_slab_spread_node() - On which node to begin search for a slab page
|
|
|
|
*/
|
2010-05-27 05:42:49 +08:00
|
|
|
int cpuset_slab_spread_node(void)
|
|
|
|
{
|
2011-07-27 07:08:30 +08:00
|
|
|
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
|
|
|
|
current->cpuset_slab_spread_rotor =
|
|
|
|
node_random(&current->mems_allowed);
|
|
|
|
|
2010-05-27 05:42:49 +08:00
|
|
|
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
|
|
|
|
}
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
|
|
|
|
|
2005-09-07 06:18:13 +08:00
|
|
|
/**
|
2007-10-17 14:25:58 +08:00
|
|
|
* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
|
|
|
|
* @tsk1: pointer to task_struct of some task.
|
|
|
|
* @tsk2: pointer to task_struct of some other task.
|
|
|
|
*
|
|
|
|
* Description: Return true if @tsk1's mems_allowed intersects the
|
|
|
|
* mems_allowed of @tsk2. Used by the OOM killer to determine if
|
|
|
|
* one of the task's memory usage might impact the memory available
|
|
|
|
* to the other.
|
2005-09-07 06:18:13 +08:00
|
|
|
**/
|
|
|
|
|
2007-10-17 14:25:58 +08:00
|
|
|
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
|
|
|
|
const struct task_struct *tsk2)
|
2005-09-07 06:18:13 +08:00
|
|
|
{
|
2007-10-17 14:25:58 +08:00
|
|
|
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
|
2005-09-07 06:18:13 +08:00
|
|
|
}
|
|
|
|
|
2009-01-07 06:39:01 +08:00
|
|
|
/**
|
2015-11-06 10:48:05 +08:00
|
|
|
* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
|
2009-01-07 06:39:01 +08:00
|
|
|
*
|
2015-11-06 10:48:05 +08:00
|
|
|
* Description: Prints current's name, cpuset name, and cached copy of its
|
2014-03-04 06:28:36 +08:00
|
|
|
* mems_allowed to the kernel log.
|
2009-01-07 06:39:01 +08:00
|
|
|
*/
|
2015-11-06 10:48:05 +08:00
|
|
|
void cpuset_print_current_mems_allowed(void)
|
2009-01-07 06:39:01 +08:00
|
|
|
{
|
2014-03-04 06:28:36 +08:00
|
|
|
struct cgroup *cgrp;
|
2009-01-07 06:39:01 +08:00
|
|
|
|
2014-03-04 06:28:36 +08:00
|
|
|
rcu_read_lock();
|
2013-01-25 16:08:01 +08:00
|
|
|
|
2015-11-06 10:48:05 +08:00
|
|
|
cgrp = task_cs(current)->css.cgroup;
|
mm, oom: reorganize the oom report in dump_header
OOM report contains several sections. The first one is the allocation
context that has triggered the OOM. Then we have cpuset context followed
by the stack trace of the OOM path. The third one is the OOM memory
information. Followed by the current memory state of all system tasks.
At last, we will show oom eligible tasks and the information about the
chosen oom victim.
One thing that makes parsing more awkward than necessary is that we do not
have a single and easily parsable line about the oom context. This patch
is reorganizing the oom report to
1) who invoked oom and what was the allocation request
[ 515.902945] tuned invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
2) OOM stack trace
[ 515.904273] CPU: 24 PID: 1809 Comm: tuned Not tainted 4.20.0-rc3+ #3
[ 515.905518] Hardware name: Inspur SA5212M4/YZMB-00370-107, BIOS 4.1.10 11/14/2016
[ 515.906821] Call Trace:
[ 515.908062] dump_stack+0x5a/0x73
[ 515.909311] dump_header+0x55/0x28c
[ 515.914260] oom_kill_process+0x2d8/0x300
[ 515.916708] out_of_memory+0x145/0x4a0
[ 515.917932] __alloc_pages_slowpath+0x7d2/0xa16
[ 515.919157] __alloc_pages_nodemask+0x277/0x290
[ 515.920367] filemap_fault+0x3d0/0x6c0
[ 515.921529] ? filemap_map_pages+0x2b8/0x420
[ 515.922709] ext4_filemap_fault+0x2c/0x40 [ext4]
[ 515.923884] __do_fault+0x20/0x80
[ 515.925032] __handle_mm_fault+0xbc0/0xe80
[ 515.926195] handle_mm_fault+0xfa/0x210
[ 515.927357] __do_page_fault+0x233/0x4c0
[ 515.928506] do_page_fault+0x32/0x140
[ 515.929646] ? page_fault+0x8/0x30
[ 515.930770] page_fault+0x1e/0x30
3) OOM memory information
[ 515.958093] Mem-Info:
[ 515.959647] active_anon:26501758 inactive_anon:1179809 isolated_anon:0
active_file:4402672 inactive_file:483963 isolated_file:1344
unevictable:0 dirty:4886753 writeback:0 unstable:0
slab_reclaimable:148442 slab_unreclaimable:18741
mapped:1347 shmem:1347 pagetables:58669 bounce:0
free:88663 free_pcp:0 free_cma:0
...
4) current memory state of all system tasks
[ 516.079544] [ 744] 0 744 9211 1345 114688 82 0 systemd-journal
[ 516.082034] [ 787] 0 787 31764 0 143360 92 0 lvmetad
[ 516.084465] [ 792] 0 792 10930 1 110592 208 -1000 systemd-udevd
[ 516.086865] [ 1199] 0 1199 13866 0 131072 112 -1000 auditd
[ 516.089190] [ 1222] 0 1222 31990 1 110592 157 0 smartd
[ 516.091477] [ 1225] 0 1225 4864 85 81920 43 0 irqbalance
[ 516.093712] [ 1226] 0 1226 52612 0 258048 426 0 abrtd
[ 516.112128] [ 1280] 0 1280 109774 55 299008 400 0 NetworkManager
[ 516.113998] [ 1295] 0 1295 28817 37 69632 24 0 ksmtuned
[ 516.144596] [ 10718] 0 10718 2622484 1721372 15998976 267219 0 panic
[ 516.145792] [ 10719] 0 10719 2622484 1164767 9818112 53576 0 panic
[ 516.146977] [ 10720] 0 10720 2622484 1174361 9904128 53709 0 panic
[ 516.148163] [ 10721] 0 10721 2622484 1209070 10194944 54824 0 panic
[ 516.149329] [ 10722] 0 10722 2622484 1745799 14774272 91138 0 panic
5) oom context (constraints and the chosen victim).
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0
An admin can easily get the full oom context at a single line which
makes parsing much easier.
Link: http://lkml.kernel.org/r/1542799799-36184-1-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 16:36:07 +08:00
|
|
|
pr_cont(",cpuset=");
|
2014-02-12 22:29:50 +08:00
|
|
|
pr_cont_cgroup_name(cgrp);
|
mm, oom: reorganize the oom report in dump_header
OOM report contains several sections. The first one is the allocation
context that has triggered the OOM. Then we have cpuset context followed
by the stack trace of the OOM path. The third one is the OOM memory
information. Followed by the current memory state of all system tasks.
At last, we will show oom eligible tasks and the information about the
chosen oom victim.
One thing that makes parsing more awkward than necessary is that we do not
have a single and easily parsable line about the oom context. This patch
is reorganizing the oom report to
1) who invoked oom and what was the allocation request
[ 515.902945] tuned invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
2) OOM stack trace
[ 515.904273] CPU: 24 PID: 1809 Comm: tuned Not tainted 4.20.0-rc3+ #3
[ 515.905518] Hardware name: Inspur SA5212M4/YZMB-00370-107, BIOS 4.1.10 11/14/2016
[ 515.906821] Call Trace:
[ 515.908062] dump_stack+0x5a/0x73
[ 515.909311] dump_header+0x55/0x28c
[ 515.914260] oom_kill_process+0x2d8/0x300
[ 515.916708] out_of_memory+0x145/0x4a0
[ 515.917932] __alloc_pages_slowpath+0x7d2/0xa16
[ 515.919157] __alloc_pages_nodemask+0x277/0x290
[ 515.920367] filemap_fault+0x3d0/0x6c0
[ 515.921529] ? filemap_map_pages+0x2b8/0x420
[ 515.922709] ext4_filemap_fault+0x2c/0x40 [ext4]
[ 515.923884] __do_fault+0x20/0x80
[ 515.925032] __handle_mm_fault+0xbc0/0xe80
[ 515.926195] handle_mm_fault+0xfa/0x210
[ 515.927357] __do_page_fault+0x233/0x4c0
[ 515.928506] do_page_fault+0x32/0x140
[ 515.929646] ? page_fault+0x8/0x30
[ 515.930770] page_fault+0x1e/0x30
3) OOM memory information
[ 515.958093] Mem-Info:
[ 515.959647] active_anon:26501758 inactive_anon:1179809 isolated_anon:0
active_file:4402672 inactive_file:483963 isolated_file:1344
unevictable:0 dirty:4886753 writeback:0 unstable:0
slab_reclaimable:148442 slab_unreclaimable:18741
mapped:1347 shmem:1347 pagetables:58669 bounce:0
free:88663 free_pcp:0 free_cma:0
...
4) current memory state of all system tasks
[ 516.079544] [ 744] 0 744 9211 1345 114688 82 0 systemd-journal
[ 516.082034] [ 787] 0 787 31764 0 143360 92 0 lvmetad
[ 516.084465] [ 792] 0 792 10930 1 110592 208 -1000 systemd-udevd
[ 516.086865] [ 1199] 0 1199 13866 0 131072 112 -1000 auditd
[ 516.089190] [ 1222] 0 1222 31990 1 110592 157 0 smartd
[ 516.091477] [ 1225] 0 1225 4864 85 81920 43 0 irqbalance
[ 516.093712] [ 1226] 0 1226 52612 0 258048 426 0 abrtd
[ 516.112128] [ 1280] 0 1280 109774 55 299008 400 0 NetworkManager
[ 516.113998] [ 1295] 0 1295 28817 37 69632 24 0 ksmtuned
[ 516.144596] [ 10718] 0 10718 2622484 1721372 15998976 267219 0 panic
[ 516.145792] [ 10719] 0 10719 2622484 1164767 9818112 53576 0 panic
[ 516.146977] [ 10720] 0 10720 2622484 1174361 9904128 53709 0 panic
[ 516.148163] [ 10721] 0 10721 2622484 1209070 10194944 54824 0 panic
[ 516.149329] [ 10722] 0 10722 2622484 1745799 14774272 91138 0 panic
5) oom context (constraints and the chosen victim).
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0
An admin can easily get the full oom context at a single line which
makes parsing much easier.
Link: http://lkml.kernel.org/r/1542799799-36184-1-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 16:36:07 +08:00
|
|
|
pr_cont(",mems_allowed=%*pbl",
|
2015-11-06 10:48:05 +08:00
|
|
|
nodemask_pr_args(&current->mems_allowed));
|
2013-03-01 15:02:15 +08:00
|
|
|
|
2013-03-12 10:28:39 +08:00
|
|
|
rcu_read_unlock();
|
2009-01-07 06:39:01 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
/*
|
|
|
|
* Collection of memory_pressure is suppressed unless
|
|
|
|
* this flag is enabled by writing "1" to the special
|
|
|
|
* cpuset file 'memory_pressure_enabled' in the root cpuset.
|
|
|
|
*/
|
|
|
|
|
2006-01-08 17:01:51 +08:00
|
|
|
/* No initializer: as a file-scope int this starts out 0, i.e. disabled. */
int cpuset_memory_pressure_enabled __read_mostly;
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
|
2022-02-16 11:17:53 +08:00
|
|
|
/*
|
|
|
|
* __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
*
|
|
|
|
* Keep a running average of the rate of synchronous (direct)
|
|
|
|
* page reclaim efforts initiated by tasks in each cpuset.
|
|
|
|
*
|
|
|
|
* This represents the rate at which some task in the cpuset
|
|
|
|
* ran low on memory on all nodes it was allowed to use, and
|
|
|
|
* had to enter the kernel's page reclaim code in an effort to
|
|
|
|
* create more free memory by tossing clean pages or swapping
|
|
|
|
* or writing dirty pages.
|
|
|
|
*
|
|
|
|
* Display to user space in the per-cpuset read-only file
|
|
|
|
* "memory_pressure". Value displayed is an integer
|
|
|
|
* representing the recent rate of entry into the synchronous
|
|
|
|
* (direct) page reclaim by any task attached to the cpuset.
|
2022-02-16 11:17:53 +08:00
|
|
|
*/
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
|
|
|
|
void __cpuset_memory_pressure_bump(void)
|
|
|
|
{
|
2014-03-04 06:28:36 +08:00
|
|
|
rcu_read_lock();
|
2007-10-19 14:39:39 +08:00
|
|
|
fmeter_markevent(&task_cs(current)->fmeter);
|
2014-03-04 06:28:36 +08:00
|
|
|
rcu_read_unlock();
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
}
|
|
|
|
|
2007-10-19 14:39:39 +08:00
|
|
|
#ifdef CONFIG_PROC_PID_CPUSET
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* proc_cpuset_show()
|
|
|
|
* - Print tasks cpuset path into seq_file.
|
|
|
|
* - Used for /proc/<pid>/cpuset.
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:30 +08:00
|
|
|
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
|
|
|
|
* doesn't really matter if tsk->cpuset changes after we read it,
|
2023-05-08 15:58:50 +08:00
|
|
|
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
|
2008-02-07 16:14:45 +08:00
|
|
|
* anyway.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2014-09-18 16:03:36 +08:00
|
|
|
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
|
|
|
|
struct pid *pid, struct task_struct *tsk)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2016-08-10 23:23:44 +08:00
|
|
|
char *buf;
|
2007-10-19 14:39:39 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2006-06-26 15:25:55 +08:00
|
|
|
int retval;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-06-26 15:25:55 +08:00
|
|
|
retval = -ENOMEM;
|
2014-02-12 22:29:50 +08:00
|
|
|
buf = kmalloc(PATH_MAX, GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!buf)
|
2006-06-26 15:25:55 +08:00
|
|
|
goto out;
|
|
|
|
|
2016-01-29 16:54:06 +08:00
|
|
|
css = task_get_css(tsk, cpuset_cgrp_id);
|
2016-08-10 23:23:44 +08:00
|
|
|
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
|
|
|
|
current->nsproxy->cgroup_ns);
|
2016-01-29 16:54:06 +08:00
|
|
|
css_put(css);
|
2016-08-10 23:23:44 +08:00
|
|
|
if (retval >= PATH_MAX)
|
2016-09-29 17:58:36 +08:00
|
|
|
retval = -ENAMETOOLONG;
|
|
|
|
if (retval < 0)
|
2014-09-18 16:03:36 +08:00
|
|
|
goto out_free;
|
2016-08-10 23:23:44 +08:00
|
|
|
seq_puts(m, buf);
|
2005-04-17 06:20:36 +08:00
|
|
|
seq_putc(m, '\n');
|
2014-02-12 22:29:50 +08:00
|
|
|
retval = 0;
|
2006-06-26 15:25:55 +08:00
|
|
|
out_free:
|
2005-04-17 06:20:36 +08:00
|
|
|
kfree(buf);
|
2006-06-26 15:25:55 +08:00
|
|
|
out:
|
2005-04-17 06:20:36 +08:00
|
|
|
return retval;
|
|
|
|
}
|
2007-10-19 14:39:39 +08:00
|
|
|
#endif /* CONFIG_PROC_PID_CPUSET */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-21 17:06:27 +08:00
|
|
|
/* Display task mems_allowed in /proc/<pid>/status file. */
|
2008-02-08 20:18:33 +08:00
|
|
|
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
|
|
|
|
{
|
2015-02-14 06:37:23 +08:00
|
|
|
seq_printf(m, "Mems_allowed:\t%*pb\n",
|
|
|
|
nodemask_pr_args(&task->mems_allowed));
|
|
|
|
seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
|
|
|
|
nodemask_pr_args(&task->mems_allowed));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|