memcg: reduce memcg tree traversals for stats collection

Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
to collect the stats while cgroup-v2's memory_stat_show traverses the
memcg tree thrice.  On a large machine, a couple thousand memcgs is very
normal and if the churn is high and memcgs stick around during to several
reasons, tens of thousands of nodes in memcg tree can exist.  This patch
has refactored and shared the stat collection code between cgroup-v1 and
cgroup-v2 and has reduced the tree traversal to just one.

I ran a simple benchmark which reads the root_mem_cgroup's stat file
1000 times in the presense of 2500 memcgs on cgroup-v1. The results are:

Without the patch:
$ time ./read-root-stat-1000-times

real    0m1.663s
user    0m0.000s
sys     0m1.660s

With the patch:
$ time ./read-root-stat-1000-times

real    0m0.468s
user    0m0.000s
sys     0m0.467s

Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Bruce Merry <bmerry@ska.ac.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Shakeel Butt 2018-08-21 21:53:17 -07:00 committed by Linus Torvalds
parent 1c4c3b99c0
commit 8de7ecc648

View File

@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
struct accumulated_stats {
unsigned long stat[MEMCG_NR_STAT];
unsigned long events[NR_VM_EVENT_ITEMS];
unsigned long lru_pages[NR_LRU_LISTS];
const unsigned int *stats_array;
const unsigned int *events_array;
int stats_size;
int events_size;
};
static void accumulate_memcg_tree(struct mem_cgroup *memcg,
struct accumulated_stats *acc)
{
struct mem_cgroup *iter;
struct mem_cgroup *mi;
int i;
memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
for_each_mem_cgroup_tree(mi, memcg) {
for (i = 0; i < acc->stats_size; i++)
acc->stat[i] += memcg_page_state(mi,
acc->stats_array ? acc->stats_array[i] : i);
for_each_mem_cgroup_tree(iter, memcg) {
for (i = 0; i < MEMCG_NR_STAT; i++)
stat[i] += memcg_page_state(iter, i);
}
}
for (i = 0; i < acc->events_size; i++)
acc->events[i] += memcg_sum_events(mi,
acc->events_array ? acc->events_array[i] : i);
static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
int i;
memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
for_each_mem_cgroup_tree(iter, memcg) {
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] += memcg_sum_events(iter, i);
for (i = 0; i < NR_LRU_LISTS; i++)
acc->lru_pages[i] +=
mem_cgroup_nr_lru_pages(mi, BIT(i));
}
}
@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long long val = 0;
memset(&acc, 0, sizeof(acc));
acc.stats_size = ARRAY_SIZE(memcg1_stats);
acc.stats_array = memcg1_stats;
acc.events_size = ARRAY_SIZE(memcg1_events);
acc.events_array = memcg1_events;
accumulate_memcg_tree(memcg, &acc);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
for_each_mem_cgroup_tree(mi, memcg)
val += memcg_page_state(mi, memcg1_stats[i]) *
PAGE_SIZE;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)acc.stat[i] * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
unsigned long long val = 0;
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
(u64)acc.events[i]);
for_each_mem_cgroup_tree(mi, memcg)
val += memcg_sum_events(mi, memcg1_events[i]);
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
}
for (i = 0; i < NR_LRU_LISTS; i++) {
unsigned long long val = 0;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
}
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
(u64)acc.lru_pages[i] * PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
unsigned long stat[MEMCG_NR_STAT];
unsigned long events[NR_VM_EVENT_ITEMS];
struct accumulated_stats acc;
int i;
/*
@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
tree_stat(memcg, stat);
tree_events(memcg, events);
memset(&acc, 0, sizeof(acc));
acc.stats_size = MEMCG_NR_STAT;
acc.events_size = NR_VM_EVENT_ITEMS;
accumulate_memcg_tree(memcg, &acc);
seq_printf(m, "anon %llu\n",
(u64)stat[MEMCG_RSS] * PAGE_SIZE);
(u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
(u64)stat[MEMCG_CACHE] * PAGE_SIZE);
(u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
(u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
(u64)(stat[NR_SLAB_RECLAIMABLE] +
stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
(u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
(u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
(u64)stat[NR_SHMEM] * PAGE_SIZE);
(u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
(u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
(u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
(u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
(u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
(u64)stat[NR_WRITEBACK] * PAGE_SIZE);
(u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
unsigned long val = 0;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_nr_lru_pages(mi, BIT(i));
seq_printf(m, "%s %llu\n",
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)acc.lru_pages[i] * PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
(u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
(u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
(u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
(u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
events[PGSCAN_DIRECT]);
seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
events[PGSTEAL_DIRECT]);
seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
acc.events[PGSCAN_DIRECT]);
seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
acc.events[PGSTEAL_DIRECT]);
seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
seq_printf(m, "workingset_refault %lu\n",
stat[WORKINGSET_REFAULT]);
acc.stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
stat[WORKINGSET_ACTIVATE]);
acc.stat[WORKINGSET_ACTIVATE]);
seq_printf(m, "workingset_nodereclaim %lu\n",
stat[WORKINGSET_NODERECLAIM]);
acc.stat[WORKINGSET_NODERECLAIM]);
return 0;
}