#!/usr/bin/env drgn
#
# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.

desc = """
This is a drgn script to monitor workqueues. For more info on drgn, visit
https://github.com/osandov/drgn.

  total    Total number of work items executed by the workqueue.

  infl     The number of currently in-flight work items.

  CPUtime  Total CPU time consumed by the workqueue in seconds. This is
           sampled from scheduler ticks and only provides ballpark
           measurement.  "nohz_full=" CPUs are excluded from measurement.

  CPUitsv  The number of times a concurrency-managed work item hogged CPU
           longer than the threshold (workqueue.cpu_intensive_thresh_us)
           and got excluded from concurrency management to avoid stalling
           other work items.

  CMW/RPR  For per-cpu workqueues, the number of concurrency-management
           wake-ups while executing a work item of the workqueue. For
           unbound workqueues, the number of times a worker was repatriated
           to its affinity scope after being migrated to an off-scope CPU by
           the scheduler.

  mayday   The number of times the rescuer was requested while waiting for
           new worker creation.

  rescued  The number of work items executed by the rescuer.
"""

import signal
import re
import time
import json

import drgn
from drgn.helpers.linux.list import list_for_each_entry

import argparse
parser = argparse.ArgumentParser(description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('workqueue', metavar='REGEX', nargs='*',
                    help='Target workqueue name patterns (all if empty)')
parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
                    help='Monitoring interval (0 to print once and exit)')
parser.add_argument('-j', '--json', action='store_true',
                    help='Output in json')
args = parser.parse_args()
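
# Example invocations (illustrative only; assumes this script is saved as
# wq_monitor.py and is run through drgn against the running kernel):
#
#   drgn wq_monitor.py                # all workqueues, refreshed every second
#   drgn wq_monitor.py -i 5 kblockd   # workqueues matching "kblockd", every 5s
#   drgn wq_monitor.py -j -i 0        # one machine-readable snapshot, then exit
#
# When executed by drgn, `prog` below is the drgn.Program for the target
# kernel, and the globals that follow are resolved from its debug info.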

workqueues              = prog['workqueues']

WQ_UNBOUND              = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']

PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']        # work items started execution
PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']      # work items completed execution
PWQ_STAT_CPU_TIME       = prog['PWQ_STAT_CPU_TIME']       # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE']  # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']      # concurrency-management worker wakeups
PWQ_STAT_REPATRIATED    = prog['PWQ_STAT_REPATRIATED']    # unbound workers brought back into scope
PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']         # maydays to rescuer
PWQ_STAT_RESCUED        = prog['PWQ_STAT_RESCUED']        # linked work items executed by rescuer
PWQ_NR_STATS            = prog['PWQ_NR_STATS']
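
# The PWQ_STAT_* values above are enumerators resolved from the kernel's debug
# info (enum pool_workqueue_stats in kernel/workqueue.c), so their numeric
# values always match the running kernel.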

class WqStats:
    def __init__(self, wq):
        self.name = wq.name.string_().decode()
        self.unbound = wq.flags & WQ_UNBOUND != 0
        self.mem_reclaim = wq.flags & WQ_MEM_RECLAIM != 0
        self.stats = [0] * PWQ_NR_STATS
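        # A workqueue is backed by one pool_workqueue (pwq) per worker_pool it
        # uses; summing each pwq's stats[] array gives whole-workqueue totals.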
        for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'repatriated'          : self.stats[PWQ_STAT_REPATRIATED],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    def table_header_str():
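        # Deliberately takes no self; main() calls WqStats.table_header_str()
        # directly on the class to print the column headers.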
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
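        # Columns that don't apply to this workqueue are shown as '-':
        # CPUitsv and CM wake-ups are only meaningful for concurrency-managed
        # per-cpu workqueues, repatriations only for unbound ones, and
        # mayday/rescued only for WQ_MEM_RECLAIM workqueues with a rescuer.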
        cpu_intensive = '-'
        cmw_rpr = '-'
        mayday = '-'
        rescued = '-'

        if self.unbound:
            cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED])
        else:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])

        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])
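
        # PWQ_STAT_CPU_TIME is accumulated in microseconds; the division by
        # 1,000,000 below reports the CPUtime column in seconds.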

        out = f'{self.name[-24:]:24} ' \
              f'{self.stats[PWQ_STAT_STARTED]:8} ' \
              f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
              f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
              f'{cpu_intensive:>7} ' \
              f'{cmw_rpr:>7} ' \
              f'{mayday:>7} ' \
              f'{rescued:>7} '

        return out.rstrip(':')

exit_req = False

def sigint_handler(signr, frame):
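    # Invoked on ctrl-c; just flags exit_req so the monitoring loop in main()
    # can finish the current iteration and exit cleanly.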
    global exit_req
    exit_req = True

def main():
    # handle args
    table_fmt = not args.json
    interval = args.interval
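
    # All REGEX arguments are OR'ed into one pattern; a workqueue is reported
    # if any of the given patterns matches its name.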

    re_str = None
    if args.workqueue:
        for r in args.workqueue:
            if re_str is None:
                re_str = r
            else:
                re_str += '|' + r

    filter_re = re.compile(re_str) if re_str else None

    # monitoring loop
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                print(stats.dict(now))

        if interval == 0:
            break
        time.sleep(interval)

if __name__ == "__main__":
    main()
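
# A rough sketch (not part of this script) of post-processing the -j output:
# each emitted line is a Python dict literal, so a hypothetical consumer could
# do something like:
#
#   import ast, sys
#   for line in sys.stdin:
#       rec = ast.literal_eval(line)
#       print(rec['name'], rec['started'] - rec['completed'])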