2006-06-30 16:55:32 +08:00
|
|
|
/*
|
|
|
|
* linux/mm/vmstat.c
|
|
|
|
*
|
|
|
|
* Manages VM statistics
|
|
|
|
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
2006-06-30 16:55:33 +08:00
|
|
|
*
|
|
|
|
* zoned VM statistics
|
|
|
|
* Copyright (C) 2006 Silicon Graphics, Inc.,
|
|
|
|
* Christoph Lameter <christoph@lameter.com>
|
2006-06-30 16:55:32 +08:00
|
|
|
*/
|
2008-10-06 08:13:52 +08:00
|
|
|
#include <linux/fs.h>
|
2006-06-30 16:55:32 +08:00
|
|
|
#include <linux/mm.h>
|
2007-07-30 06:36:13 +08:00
|
|
|
#include <linux/err.h>
|
2006-06-30 16:55:33 +08:00
|
|
|
#include <linux/module.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2006-09-01 12:27:35 +08:00
|
|
|
#include <linux/cpu.h>
|
2008-07-24 12:27:03 +08:00
|
|
|
#include <linux/vmstat.h>
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
#include <linux/sched.h>
|
2010-05-25 05:32:26 +08:00
|
|
|
#include <linux/math64.h>
|
2010-10-27 05:21:36 +08:00
|
|
|
#include <linux/writeback.h>
|
2010-10-27 05:22:04 +08:00
|
|
|
#include <linux/compaction.h>
|
2006-06-30 16:55:32 +08:00
|
|
|
|
2006-06-30 16:55:45 +08:00
|
|
|
#ifdef CONFIG_VM_EVENT_COUNTERS
|
|
|
|
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
|
|
|
|
EXPORT_PER_CPU_SYMBOL(vm_event_states);
|
|
|
|
|
2010-08-10 08:18:59 +08:00
|
|
|
static void sum_vm_events(unsigned long *ret)
|
2006-06-30 16:55:45 +08:00
|
|
|
{
|
2008-02-05 14:29:22 +08:00
|
|
|
int cpu;
|
2006-06-30 16:55:45 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
|
|
|
|
|
2010-08-10 08:18:59 +08:00
|
|
|
for_each_online_cpu(cpu) {
|
2006-06-30 16:55:45 +08:00
|
|
|
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
|
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
|
|
|
|
ret[i] += this->event[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Gather the vm event totals of all online CPUs into ret[].
 *
 * @ret: array of NR_VM_EVENT_ITEMS unsigned longs that receives the sums.
 *
 * The summation runs between get_online_cpus() and put_online_cpus().
 * The result is still only approximate: the counters can change both
 * while this function runs and after it has returned.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
|
2006-06-30 16:55:45 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_HOTPLUG
|
|
|
|
/*
|
|
|
|
* Fold the foreign cpu events into our own.
|
|
|
|
*
|
|
|
|
* This is adding to the events on one processor
|
|
|
|
* but keeps the global counts constant.
|
|
|
|
*/
|
|
|
|
void vm_events_fold_cpu(int cpu)
|
|
|
|
{
|
|
|
|
struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
|
|
|
|
count_vm_events(i, fold_state->event[i]);
|
|
|
|
fold_state->event[i] = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_HOTPLUG */
|
|
|
|
|
|
|
|
#endif /* CONFIG_VM_EVENT_COUNTERS */
|
|
|
|
|
2006-06-30 16:55:33 +08:00
|
|
|
/*
|
|
|
|
* Manage combined zone based / global counters
|
|
|
|
*
|
|
|
|
* vm_stat contains the global counters
|
|
|
|
*/
|
|
|
|
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
|
|
|
|
EXPORT_SYMBOL(vm_stat);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
2011-01-14 07:45:43 +08:00
|
|
|
/*
 * Compute the reduced per-cpu counter threshold for @zone.
 *
 * As vmstats are not up to date, there is drift between the estimated and
 * the real counter values. With high thresholds and many CPUs the min
 * watermark can be breached while the estimate still looks fine. The
 * pressure threshold is therefore sized so that even the maximum amount of
 * drift will not accidentally breach the min watermark: the distance
 * between the low and min watermarks is shared out among the online CPUs.
 *
 * According to the change that introduced this function, the reduced
 * threshold is meant to be applied to a pgdat while kswapd is awake, at
 * the price of more frequent counter updates under memory pressure.
 *
 * Returns a value in the range 1..125.
 */
int calculate_pressure_threshold(struct zone *zone)
{
	int watermark_distance;
	int threshold;

	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);

	/* Each CPU gets an equal share of the distance, but at least 1. */
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	return min(125, threshold);
}
|
|
|
|
|
2011-01-14 07:45:43 +08:00
|
|
|
int calculate_normal_threshold(struct zone *zone)
|
2006-09-01 12:27:35 +08:00
|
|
|
{
|
|
|
|
int threshold;
|
|
|
|
int mem; /* memory in 128 MB units */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The threshold scales with the number of processors and the amount
|
|
|
|
* of memory per zone. More memory means that we can defer updates for
|
|
|
|
* longer, more processors could lead to more contention.
|
|
|
|
* fls() is used to have a cheap way of logarithmic scaling.
|
|
|
|
*
|
|
|
|
* Some sample thresholds:
|
|
|
|
*
|
|
|
|
* Threshold Processors (fls) Zonesize fls(mem+1)
|
|
|
|
* ------------------------------------------------------------------
|
|
|
|
* 8 1 1 0.9-1 GB 4
|
|
|
|
* 16 2 2 0.9-1 GB 4
|
|
|
|
* 20 2 2 1-2 GB 5
|
|
|
|
* 24 2 2 2-4 GB 6
|
|
|
|
* 28 2 2 4-8 GB 7
|
|
|
|
* 32 2 2 8-16 GB 8
|
|
|
|
* 4 2 2 <128M 1
|
|
|
|
* 30 4 3 2-4 GB 5
|
|
|
|
* 48 4 3 8-16 GB 8
|
|
|
|
* 32 8 4 1-2 GB 4
|
|
|
|
* 32 8 4 0.9-1GB 4
|
|
|
|
* 10 16 5 <128M 1
|
|
|
|
* 40 16 5 900M 4
|
|
|
|
* 70 64 7 2-4 GB 5
|
|
|
|
* 84 64 7 4-8 GB 6
|
|
|
|
* 108 512 9 4-8 GB 6
|
|
|
|
* 125 1024 10 8-16 GB 8
|
|
|
|
* 125 1024 10 16-32 GB 9
|
|
|
|
*/
|
|
|
|
|
|
|
|
mem = zone->present_pages >> (27 - PAGE_SHIFT);
|
|
|
|
|
|
|
|
threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Maximum threshold is 125
|
|
|
|
*/
|
|
|
|
threshold = min(125, threshold);
|
|
|
|
|
|
|
|
return threshold;
|
|
|
|
}
|
2006-06-30 16:55:33 +08:00
|
|
|
|
|
|
|
/*
|
2006-09-01 12:27:35 +08:00
|
|
|
* Refresh the thresholds for each zone.
|
2006-06-30 16:55:33 +08:00
|
|
|
*/
|
2011-05-25 08:11:33 +08:00
|
|
|
void refresh_zone_stat_thresholds(void)
|
2006-06-30 16:55:33 +08:00
|
|
|
{
|
2006-09-01 12:27:35 +08:00
|
|
|
struct zone *zone;
|
|
|
|
int cpu;
|
|
|
|
int threshold;
|
|
|
|
|
2009-04-01 06:19:31 +08:00
|
|
|
for_each_populated_zone(zone) {
|
2010-09-10 07:38:17 +08:00
|
|
|
unsigned long max_drift, tolerate_drift;
|
|
|
|
|
2011-01-14 07:45:43 +08:00
|
|
|
threshold = calculate_normal_threshold(zone);
|
2006-09-01 12:27:35 +08:00
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
2010-01-05 14:34:51 +08:00
|
|
|
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
|
|
|
|
= threshold;
|
2010-09-10 07:38:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only set percpu_drift_mark if there is a danger that
|
|
|
|
* NR_FREE_PAGES reports the low watermark is ok when in fact
|
|
|
|
* the min watermark could be breached by an allocation
|
|
|
|
*/
|
|
|
|
tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
|
|
|
|
max_drift = num_online_cpus() * threshold;
|
|
|
|
if (max_drift > tolerate_drift)
|
|
|
|
zone->percpu_drift_mark = high_wmark_pages(zone) +
|
|
|
|
max_drift;
|
2006-09-01 12:27:35 +08:00
|
|
|
}
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
|
|
|
|
2011-01-14 07:45:43 +08:00
|
|
|
/*
 * Install a new per-cpu counter threshold on the zones of one node.
 *
 * @pgdat:              node whose zones are updated
 * @calculate_pressure: callback returning the threshold to use for a zone
 *
 * Only zones with a non-zero percpu_drift_mark are touched. For those the
 * callback's result is written to the pageset of every possible CPU.
 *
 * According to the change that introduced this function, it lets the
 * thresholds of a pgdat be reduced while kswapd is awake, so that counter
 * drift is limited to what should be a safe level.
 */
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = &pgdat->node_zones[i];
		int threshold;
		int cpu;

		/* No drift mark was set for this zone; nothing to adjust. */
		if (!zone->percpu_drift_mark)
			continue;

		threshold = calculate_pressure(zone);
		for_each_possible_cpu(cpu) {
			struct per_cpu_pageset *pset;

			pset = per_cpu_ptr(zone->pageset, cpu);
			pset->stat_threshold = threshold;
		}
	}
}
|
|
|
|
|
2006-06-30 16:55:33 +08:00
|
|
|
/*
|
|
|
|
* For use when we know that interrupts are disabled.
|
|
|
|
*/
|
|
|
|
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
|
|
|
int delta)
|
|
|
|
{
|
2010-12-07 01:16:20 +08:00
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
|
|
s8 __percpu *p = pcp->vm_stat_diff + item;
|
2006-06-30 16:55:33 +08:00
|
|
|
long x;
|
2010-12-07 01:16:20 +08:00
|
|
|
long t;
|
|
|
|
|
|
|
|
x = delta + __this_cpu_read(*p);
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-12-07 01:16:20 +08:00
|
|
|
t = __this_cpu_read(pcp->stat_threshold);
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-12-07 01:16:20 +08:00
|
|
|
if (unlikely(x > t || x < -t)) {
|
2006-06-30 16:55:33 +08:00
|
|
|
zone_page_state_add(x, zone, item);
|
|
|
|
x = 0;
|
|
|
|
}
|
2010-12-07 01:16:20 +08:00
|
|
|
__this_cpu_write(*p, x);
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__mod_zone_page_state);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Optimized increment and decrement functions.
|
|
|
|
*
|
|
|
|
* These are only for a single page and therefore can take a struct page *
|
|
|
|
* argument instead of struct zone *. This allows the inclusion of the code
|
|
|
|
* generated for page_zone(page) into the optimized functions.
|
|
|
|
*
|
|
|
|
* No overflow check is necessary and therefore the differential can be
|
|
|
|
* incremented or decremented in place which may allow the compilers to
|
|
|
|
* generate better code.
|
|
|
|
* The increment or decrement is known and therefore one boundary check can
|
|
|
|
* be omitted.
|
|
|
|
*
|
2006-09-01 12:27:35 +08:00
|
|
|
* NOTE: These functions are very performance sensitive. Change only
|
|
|
|
* with care.
|
|
|
|
*
|
2006-06-30 16:55:33 +08:00
|
|
|
* Some processors have inc/dec instructions that are atomic vs an interrupt.
|
|
|
|
* However, the code must first determine the differential location in a zone
|
|
|
|
* based on the processor number and then inc/dec the counter. There is no
|
|
|
|
* guarantee without disabling preemption that the processor will not change
|
|
|
|
* in between and therefore the atomicity vs. interrupt cannot be exploited
|
|
|
|
* in a useful way here.
|
|
|
|
*/
|
2007-02-10 17:43:01 +08:00
|
|
|
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
|
2006-06-30 16:55:33 +08:00
|
|
|
{
|
2010-12-07 01:16:20 +08:00
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
|
|
s8 __percpu *p = pcp->vm_stat_diff + item;
|
|
|
|
s8 v, t;
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-12-07 01:40:02 +08:00
|
|
|
v = __this_cpu_inc_return(*p);
|
2010-12-07 01:16:20 +08:00
|
|
|
t = __this_cpu_read(pcp->stat_threshold);
|
|
|
|
if (unlikely(v > t)) {
|
|
|
|
s8 overstep = t >> 1;
|
2006-09-01 12:27:35 +08:00
|
|
|
|
2010-12-07 01:16:20 +08:00
|
|
|
zone_page_state_add(v + overstep, zone, item);
|
|
|
|
__this_cpu_write(*p, -overstep);
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
|
|
|
}
|
2006-06-30 16:55:44 +08:00
|
|
|
|
|
|
|
/*
 * Increment counter @item for the zone that @page belongs to.
 *
 * Looks the zone up with page_zone() and defers to __inc_zone_state().
 */
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);

	__inc_zone_state(zone, item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
|
|
|
|
|
2007-02-10 17:43:01 +08:00
|
|
|
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
|
2006-06-30 16:55:33 +08:00
|
|
|
{
|
2010-12-07 01:16:20 +08:00
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
|
|
s8 __percpu *p = pcp->vm_stat_diff + item;
|
|
|
|
s8 v, t;
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-12-07 01:40:02 +08:00
|
|
|
v = __this_cpu_dec_return(*p);
|
2010-12-07 01:16:20 +08:00
|
|
|
t = __this_cpu_read(pcp->stat_threshold);
|
|
|
|
if (unlikely(v < - t)) {
|
|
|
|
s8 overstep = t >> 1;
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-12-07 01:16:20 +08:00
|
|
|
zone_page_state_add(v - overstep, zone, item);
|
|
|
|
__this_cpu_write(*p, overstep);
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
|
|
|
}
|
2007-02-10 17:43:01 +08:00
|
|
|
|
|
|
|
/*
 * Decrement counter @item for the zone that @page belongs to.
 *
 * Looks the zone up with page_zone() and defers to __dec_zone_state().
 */
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);

	__dec_zone_state(zone, item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
|
|
|
|
|
2010-12-15 00:28:46 +08:00
|
|
|
#ifdef CONFIG_CMPXCHG_LOCAL
|
|
|
|
/*
|
|
|
|
* If we have cmpxchg_local support then we do not need to incur the overhead
|
|
|
|
* that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
|
|
|
|
*
|
|
|
|
* mod_state() modifies the zone counter state through atomic per cpu
|
|
|
|
* operations.
|
|
|
|
*
|
|
|
|
* Overstep mode specifies how overstep should handled:
|
|
|
|
* 0 No overstepping
|
|
|
|
* 1 Overstepping half of threshold
|
|
|
|
* -1 Overstepping minus half of threshold
|
|
|
|
*/
|
|
|
|
static inline void mod_state(struct zone *zone,
|
|
|
|
enum zone_stat_item item, int delta, int overstep_mode)
|
|
|
|
{
|
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
|
|
s8 __percpu *p = pcp->vm_stat_diff + item;
|
|
|
|
long o, n, t, z;
|
|
|
|
|
|
|
|
do {
|
|
|
|
z = 0; /* overflow to zone counters */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The fetching of the stat_threshold is racy. We may apply
|
|
|
|
* a counter threshold to the wrong the cpu if we get
|
2011-04-15 06:21:58 +08:00
|
|
|
* rescheduled while executing here. However, the next
|
|
|
|
* counter update will apply the threshold again and
|
|
|
|
* therefore bring the counter under the threshold again.
|
|
|
|
*
|
|
|
|
* Most of the time the thresholds are the same anyways
|
|
|
|
* for all cpus in a zone.
|
2010-12-15 00:28:46 +08:00
|
|
|
*/
|
|
|
|
t = this_cpu_read(pcp->stat_threshold);
|
|
|
|
|
|
|
|
o = this_cpu_read(*p);
|
|
|
|
n = delta + o;
|
|
|
|
|
|
|
|
if (n > t || n < -t) {
|
|
|
|
int os = overstep_mode * (t >> 1) ;
|
|
|
|
|
|
|
|
/* Overflow must be added to zone counters */
|
|
|
|
z = n + os;
|
|
|
|
n = -os;
|
|
|
|
}
|
|
|
|
} while (this_cpu_cmpxchg(*p, o, n) != o);
|
|
|
|
|
|
|
|
if (z)
|
|
|
|
zone_page_state_add(z, zone, item);
|
|
|
|
}
|
|
|
|
|
|
|
|
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
|
|
|
int delta)
|
|
|
|
{
|
|
|
|
mod_state(zone, item, delta, 0);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(mod_zone_page_state);
|
|
|
|
|
|
|
|
/*
 * Increment counter @item of @zone by one.
 *
 * CONFIG_CMPXCHG_LOCAL variant: forwards to mod_state() with a delta of 1
 * and overstep mode 1 (overstepping half of threshold).
 */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	const int delta = 1;
	const int overstep_mode = 1;

	mod_state(zone, item, delta, overstep_mode);
}
|
|
|
|
|
|
|
|
/*
 * Increment counter @item for the zone that @page belongs to.
 *
 * CONFIG_CMPXCHG_LOCAL variant: resolves the zone with page_zone() and
 * forwards to mod_state() with a delta of 1 and overstep mode 1.
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);

	mod_state(zone, item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);
|
|
|
|
|
|
|
|
/*
 * Decrement counter @item for the zone that @page belongs to.
 *
 * CONFIG_CMPXCHG_LOCAL variant: resolves the zone with page_zone() and
 * forwards to mod_state() with a delta of -1 and overstep mode -1.
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);

	mod_state(zone, item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* Use interrupt disable to serialize counter updates
|
|
|
|
*/
|
|
|
|
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
|
|
|
int delta)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
__mod_zone_page_state(zone, item, delta);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(mod_zone_page_state);
|
|
|
|
|
2006-06-30 16:55:44 +08:00
|
|
|
/*
 * Increment counter @item of @zone by one.
 *
 * Variant built when CONFIG_CMPXCHG_LOCAL is not set: runs
 * __inc_zone_state() between local_irq_save() and local_irq_restore().
 */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	__inc_zone_state(zone, item);
	local_irq_restore(irq_flags);
}
|
|
|
|
|
2006-06-30 16:55:33 +08:00
|
|
|
/*
 * Increment counter @item for the zone that @page belongs to.
 *
 * Variant built when CONFIG_CMPXCHG_LOCAL is not set: the zone is looked
 * up before interrupts are disabled, then __inc_zone_state() runs between
 * local_irq_save() and local_irq_restore().
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	__inc_zone_state(zone, item);
	local_irq_restore(irq_flags);
}
EXPORT_SYMBOL(inc_zone_page_state);
|
|
|
|
|
|
|
|
/*
 * Decrement counter @item for the zone that @page belongs to.
 *
 * Variant built when CONFIG_CMPXCHG_LOCAL is not set: runs
 * __dec_zone_page_state() between local_irq_save() and
 * local_irq_restore().
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(irq_flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
|
2010-12-15 00:28:46 +08:00
|
|
|
#endif
|
2006-06-30 16:55:33 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the zone counters for one cpu.
|
2007-05-09 17:35:14 +08:00
|
|
|
*
|
2008-02-05 14:29:16 +08:00
|
|
|
* The cpu specified must be either the current cpu or a processor that
|
|
|
|
* is not online. If it is the current cpu then the execution thread must
|
|
|
|
* be pinned to the current cpu.
|
|
|
|
*
|
2007-05-09 17:35:14 +08:00
|
|
|
* Note that refresh_cpu_vm_stats strives to only access
|
|
|
|
* node local memory. The per cpu pagesets on remote zones are placed
|
|
|
|
* in the memory local to the processor using that pageset. So the
|
|
|
|
* loop over all zones will access a series of cachelines local to
|
|
|
|
* the processor.
|
|
|
|
*
|
|
|
|
* The call to zone_page_state_add updates the cachelines with the
|
|
|
|
* statistics in the remote zone struct as well as the global cachelines
|
|
|
|
* with the global counters. These could cause remote node cache line
|
|
|
|
* bouncing and will have to be only done when necessary.
|
2006-06-30 16:55:33 +08:00
|
|
|
*/
|
|
|
|
void refresh_cpu_vm_stats(int cpu)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
int i;
|
2008-02-05 14:29:16 +08:00
|
|
|
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2009-04-01 06:19:31 +08:00
|
|
|
for_each_populated_zone(zone) {
|
2007-05-09 17:35:14 +08:00
|
|
|
struct per_cpu_pageset *p;
|
2006-06-30 16:55:33 +08:00
|
|
|
|
2010-01-05 14:34:51 +08:00
|
|
|
p = per_cpu_ptr(zone->pageset, cpu);
|
2006-06-30 16:55:33 +08:00
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
|
2007-05-09 17:35:14 +08:00
|
|
|
if (p->vm_stat_diff[i]) {
|
2008-02-05 14:29:16 +08:00
|
|
|
unsigned long flags;
|
|
|
|
int v;
|
|
|
|
|
2006-06-30 16:55:33 +08:00
|
|
|
local_irq_save(flags);
|
2008-02-05 14:29:16 +08:00
|
|
|
v = p->vm_stat_diff[i];
|
2007-05-09 17:35:14 +08:00
|
|
|
p->vm_stat_diff[i] = 0;
|
2008-02-05 14:29:16 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
atomic_long_add(v, &zone->vm_stat[i]);
|
|
|
|
global_diff[i] += v;
|
2007-05-09 17:35:14 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/* 3 seconds idle till flush */
|
|
|
|
p->expire = 3;
|
|
|
|
#endif
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
2008-04-28 17:13:37 +08:00
|
|
|
cond_resched();
|
2007-05-09 17:35:14 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
|
|
|
* Deal with draining the remote pageset of this
|
|
|
|
* processor
|
|
|
|
*
|
|
|
|
* Check if there are pages remaining in this pageset
|
|
|
|
* if not then there is nothing to expire.
|
|
|
|
*/
|
2008-02-05 14:29:19 +08:00
|
|
|
if (!p->expire || !p->pcp.count)
|
2007-05-09 17:35:14 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We never drain zones local to this processor.
|
|
|
|
*/
|
|
|
|
if (zone_to_nid(zone) == numa_node_id()) {
|
|
|
|
p->expire = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
p->expire--;
|
|
|
|
if (p->expire)
|
|
|
|
continue;
|
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
if (p->pcp.count)
|
|
|
|
drain_zone_pages(zone, &p->pcp);
|
2007-05-09 17:35:14 +08:00
|
|
|
#endif
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
2008-02-05 14:29:16 +08:00
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
|
|
|
|
if (global_diff[i])
|
|
|
|
atomic_long_add(global_diff[i], &vm_stat[i]);
|
2006-06-30 16:55:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2006-06-30 16:55:44 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
|
|
|
* zonelist = the list of zones passed to the allocator
|
|
|
|
* z = the zone from which the allocation occurred.
|
|
|
|
*
|
|
|
|
* Must be called with interrupts disabled.
|
2011-03-23 07:33:12 +08:00
|
|
|
*
|
|
|
|
* When __GFP_OTHER_NODE is set assume the node of the preferred
|
|
|
|
* zone is the local node. This is useful for daemons who allocate
|
|
|
|
* memory on behalf of other processes.
|
2006-06-30 16:55:44 +08:00
|
|
|
*/
|
2011-03-23 07:33:12 +08:00
|
|
|
void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
|
2006-06-30 16:55:44 +08:00
|
|
|
{
|
2008-04-28 17:12:14 +08:00
|
|
|
if (z->zone_pgdat == preferred_zone->zone_pgdat) {
|
2006-06-30 16:55:44 +08:00
|
|
|
__inc_zone_state(z, NUMA_HIT);
|
|
|
|
} else {
|
|
|
|
__inc_zone_state(z, NUMA_MISS);
|
2008-04-28 17:12:14 +08:00
|
|
|
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
|
2006-06-30 16:55:44 +08:00
|
|
|
}
|
2011-03-23 07:33:12 +08:00
|
|
|
if (z->node == ((flags & __GFP_OTHER_NODE) ?
|
|
|
|
preferred_zone->node : numa_node_id()))
|
2006-06-30 16:55:44 +08:00
|
|
|
__inc_zone_state(z, NUMA_LOCAL);
|
|
|
|
else
|
|
|
|
__inc_zone_state(z, NUMA_OTHER);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2010-05-25 05:32:25 +08:00
|
|
|
#ifdef CONFIG_COMPACTION
|
2010-10-27 05:22:04 +08:00
|
|
|
|
2010-05-25 05:32:25 +08:00
|
|
|
/* Summary of a zone's free lists with respect to one allocation order. */
struct contig_page_info {
	/* Free base pages, summed over all free blocks */
	unsigned long free_pages;
	/* Free blocks of any order */
	unsigned long free_blocks_total;
	/* Blocks of the target order obtainable from large-enough free blocks */
	unsigned long free_blocks_suitable;
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the number of free pages in a zone, how many contiguous
|
|
|
|
* pages are free and how many are large enough to satisfy an allocation of
|
|
|
|
* the target size. Note that this function makes no attempt to estimate
|
|
|
|
* how many suitable free blocks there *might* be if MOVABLE pages were
|
|
|
|
* migrated. Calculating that is possible, but expensive and can be
|
|
|
|
* figured out from userspace
|
|
|
|
*/
|
|
|
|
static void fill_contig_page_info(struct zone *zone,
|
|
|
|
unsigned int suitable_order,
|
|
|
|
struct contig_page_info *info)
|
|
|
|
{
|
|
|
|
unsigned int order;
|
|
|
|
|
|
|
|
info->free_pages = 0;
|
|
|
|
info->free_blocks_total = 0;
|
|
|
|
info->free_blocks_suitable = 0;
|
|
|
|
|
|
|
|
for (order = 0; order < MAX_ORDER; order++) {
|
|
|
|
unsigned long blocks;
|
|
|
|
|
|
|
|
/* Count number of free blocks */
|
|
|
|
blocks = zone->free_area[order].nr_free;
|
|
|
|
info->free_blocks_total += blocks;
|
|
|
|
|
|
|
|
/* Count free base pages */
|
|
|
|
info->free_pages += blocks << order;
|
|
|
|
|
|
|
|
/* Count the suitable free blocks */
|
|
|
|
if (order >= suitable_order)
|
|
|
|
info->free_blocks_suitable += blocks <<
|
|
|
|
(order - suitable_order);
|
|
|
|
}
|
|
|
|
}
|
2010-05-25 05:32:26 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A fragmentation index only makes sense if an allocation of a requested
|
|
|
|
* size would fail. If that is true, the fragmentation index indicates
|
|
|
|
* whether external fragmentation or a lack of memory was the problem.
|
|
|
|
* The value can be used to determine if page reclaim or compaction
|
|
|
|
* should be used
|
|
|
|
*/
|
2010-05-25 05:32:30 +08:00
|
|
|
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
|
2010-05-25 05:32:26 +08:00
|
|
|
{
|
|
|
|
unsigned long requested = 1UL << order;
|
|
|
|
|
|
|
|
if (!info->free_blocks_total)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Fragmentation index only makes sense when a request would fail */
|
|
|
|
if (info->free_blocks_suitable)
|
|
|
|
return -1000;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Index is between 0 and 1 so return within 3 decimal places
|
|
|
|
*
|
|
|
|
* 0 => allocation would fail due to lack of memory
|
|
|
|
* 1 => allocation would fail due to fragmentation
|
|
|
|
*/
|
|
|
|
return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
|
|
|
|
}
|
2010-05-25 05:32:30 +08:00
|
|
|
|
|
|
|
/* Same as __fragmentation index but allocs contig_page_info on stack */
|
|
|
|
int fragmentation_index(struct zone *zone, unsigned int order)
|
|
|
|
{
|
|
|
|
struct contig_page_info info;
|
|
|
|
|
|
|
|
fill_contig_page_info(zone, order, &info);
|
|
|
|
return __fragmentation_index(order, &info);
|
|
|
|
}
|
2010-05-25 05:32:25 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
|
2008-10-06 08:13:52 +08:00
|
|
|
#include <linux/proc_fs.h>
|
2006-06-30 16:55:32 +08:00
|
|
|
#include <linux/seq_file.h>
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
/*
 * Printable names of the page migrate types.
 * NOTE(review): entries are positional, so the order presumably has to
 * match the migrate type enumeration - confirm against its definition.
 */
static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};
|
|
|
|
|
2006-06-30 16:55:32 +08:00
|
|
|
/*
 * seq_file start callback: return the online pgdat at position *pos,
 * counting from the first online pgdat, or NULL when *pos lies beyond
 * the last one.
 */
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	loff_t remaining = *pos;
	pg_data_t *pgdat = first_online_pgdat();

	/* Step forward once for every position still to skip. */
	while (pgdat && remaining) {
		remaining--;
		pgdat = next_online_pgdat(pgdat);
	}

	return pgdat;
}
|
|
|
|
|
|
|
|
/*
 * seq_file next callback: advance *pos and return the online pgdat that
 * follows @arg.
 */
static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *current_pgdat = arg;

	++*pos;

	return next_online_pgdat(current_pgdat);
}
|
|
|
|
|
|
|
|
/* seq_file stop callback: intentionally empty, it performs no work. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
/* Walk all the zones in a node and print using a callback */
|
|
|
|
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
|
|
|
|
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
|
2006-06-30 16:55:32 +08:00
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
struct zone *node_zones = pgdat->node_zones;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
|
|
|
|
if (!populated_zone(zone))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
print(m, pgdat, zone);
|
2006-06-30 16:55:32 +08:00
|
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
}
|
|
|
|
}
|
2010-05-25 05:32:25 +08:00
|
|
|
#endif
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
|
2011-09-15 07:21:05 +08:00
|
|
|
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
|
2011-05-25 08:11:28 +08:00
|
|
|
/*
 * Helpers for building per-zone counter names. Each TEXT_FOR_<ZONE>(xx)
 * expands to the string xx with the zone suffix appended, followed by a
 * comma, when the corresponding zone is configured, and to nothing
 * otherwise.
 */
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

/*
 * One name per zone for the prefix xx: the optional zones above plus the
 * unconditional "_normal" and "_movable" entries. The expansion ends with
 * a trailing comma so it can be dropped into an initializer list.
 */
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
|
|
|
|
|
|
|
|
const char * const vmstat_text[] = {
|
|
|
|
/* Zoned VM counters */
|
|
|
|
"nr_free_pages",
|
|
|
|
"nr_inactive_anon",
|
|
|
|
"nr_active_anon",
|
|
|
|
"nr_inactive_file",
|
|
|
|
"nr_active_file",
|
|
|
|
"nr_unevictable",
|
|
|
|
"nr_mlock",
|
|
|
|
"nr_anon_pages",
|
|
|
|
"nr_mapped",
|
|
|
|
"nr_file_pages",
|
|
|
|
"nr_dirty",
|
|
|
|
"nr_writeback",
|
|
|
|
"nr_slab_reclaimable",
|
|
|
|
"nr_slab_unreclaimable",
|
|
|
|
"nr_page_table_pages",
|
|
|
|
"nr_kernel_stack",
|
|
|
|
"nr_unstable",
|
|
|
|
"nr_bounce",
|
|
|
|
"nr_vmscan_write",
|
mm: vmscan: do not writeback filesystem pages in direct reclaim
Testing from the XFS folk revealed that there is still too much I/O from
the end of the LRU in kswapd. Previously it was considered acceptable by
VM people for a small number of pages to be written back from reclaim with
testing generally showing about 0.3% of pages reclaimed were written back
(higher if memory was low). That writing back a small number of pages is
ok has been heavily disputed for quite some time and Dave Chinner
explained it well;
It doesn't have to be a very high number to be a problem. IO
is orders of magnitude slower than the CPU time it takes to
flush a page, so the cost of making a bad flush decision is
very high. And single page writeback from the LRU is almost
always a bad flush decision.
To complicate matters, filesystems respond very differently to requests
from reclaim according to Christoph Hellwig;
xfs tries to write it back if the requester is kswapd
ext4 ignores the request if it's a delayed allocation
btrfs ignores the request
As a result, each filesystem has different performance characteristics
when under memory pressure and there are many pages being dirtied. In
some cases, the request is ignored entirely so the VM cannot depend on the
IO being dispatched.
The objective of this series is to reduce writing of filesystem-backed
pages from reclaim, play nicely with writeback that is already in progress
and throttle reclaim appropriately when writeback pages are encountered.
The assumption is that the flushers will always write pages faster than if
reclaim issues the IO.
A secondary goal is to avoid the problem whereby direct reclaim splices
two potentially deep call stacks together.
There is a potential new problem as reclaim has less control over how long
before a page in a particular zone or container is cleaned and direct
reclaimers depend on kswapd or flusher threads to do the necessary work.
However, as filesystems sometimes ignore direct reclaim requests already,
it is not expected to be a serious issue.
Patch 1 disables writeback of filesystem pages from direct reclaim
entirely. Anonymous pages are still written.
Patch 2 removes dead code in lumpy reclaim as it is no longer able
to synchronously write pages. This hurts lumpy reclaim but
there is an expectation that compaction is used for hugepage
allocations these days and lumpy reclaim's days are numbered.
Patches 3-4 add warnings to XFS and ext4 if called from
direct reclaim. With patch 1, this "never happens" and is
intended to catch regressions in this logic in the future.
Patch 5 disables writeback of filesystem pages from kswapd unless
the priority is raised to the point where kswapd is considered
to be in trouble.
Patch 6 throttles reclaimers if too many dirty pages are being
encountered and the zones or backing devices are congested.
Patch 7 invalidates dirty pages found at the end of the LRU so they
are reclaimed quickly after being written back rather than
waiting for a reclaimer to find them
I consider this series to be orthogonal to the writeback work but it is
worth noting that the writeback work affects the viability of patch 8 in
particular.
I tested this on ext4 and xfs using fs_mark, a simple writeback test based
on dd and a micro benchmark that does a streaming write to a large mapping
(exercises use-once LRU logic) followed by streaming writes to a mix of
anonymous and file-backed mappings. The command line for fs_mark when
booted with 512M looked something like
./fs_mark -d /tmp/fsmark-2676 -D 100 -N 150 -n 150 -L 25 -t 1 -S0 -s 10485760
The number of files was adjusted depending on the amount of available
memory so that the files created was about 3xRAM. For multiple threads,
the -d switch is specified multiple times.
The test machine is x86-64 with an older generation of AMD processor with
4 cores. The underlying storage was 4 disks configured as RAID-0 as this
was the best configuration of storage I had available. Swap is on a
separate disk. Dirty ratio was tuned to 40% instead of the default of
20%.
Testing was run with and without monitors to both verify that the patches
were operating as expected and that any performance gain was real and not
due to interference from monitors.
Here is a summary of results based on testing XFS.
512M1P-xfs Files/s mean 32.69 ( 0.00%) 34.44 ( 5.08%)
512M1P-xfs Elapsed Time fsmark 51.41 48.29
512M1P-xfs Elapsed Time simple-wb 114.09 108.61
512M1P-xfs Elapsed Time mmap-strm 113.46 109.34
512M1P-xfs Kswapd efficiency fsmark 62% 63%
512M1P-xfs Kswapd efficiency simple-wb 56% 61%
512M1P-xfs Kswapd efficiency mmap-strm 44% 42%
512M-xfs Files/s mean 30.78 ( 0.00%) 35.94 (14.36%)
512M-xfs Elapsed Time fsmark 56.08 48.90
512M-xfs Elapsed Time simple-wb 112.22 98.13
512M-xfs Elapsed Time mmap-strm 219.15 196.67
512M-xfs Kswapd efficiency fsmark 54% 56%
512M-xfs Kswapd efficiency simple-wb 54% 55%
512M-xfs Kswapd efficiency mmap-strm 45% 44%
512M-4X-xfs Files/s mean 30.31 ( 0.00%) 33.33 ( 9.06%)
512M-4X-xfs Elapsed Time fsmark 63.26 55.88
512M-4X-xfs Elapsed Time simple-wb 100.90 90.25
512M-4X-xfs Elapsed Time mmap-strm 261.73 255.38
512M-4X-xfs Kswapd efficiency fsmark 49% 50%
512M-4X-xfs Kswapd efficiency simple-wb 54% 56%
512M-4X-xfs Kswapd efficiency mmap-strm 37% 36%
512M-16X-xfs Files/s mean 60.89 ( 0.00%) 65.22 ( 6.64%)
512M-16X-xfs Elapsed Time fsmark 67.47 58.25
512M-16X-xfs Elapsed Time simple-wb 103.22 90.89
512M-16X-xfs Elapsed Time mmap-strm 237.09 198.82
512M-16X-xfs Kswapd efficiency fsmark 45% 46%
512M-16X-xfs Kswapd efficiency simple-wb 53% 55%
512M-16X-xfs Kswapd efficiency mmap-strm 33% 33%
Up until 512-4X, the FSmark improvements were statistically significant.
For the 4X and 16X tests the results were within standard deviations but
just barely. The time to completion for all tests is improved which is an
important result. In general, kswapd efficiency is not affected by
skipping dirty pages.
1024M1P-xfs Files/s mean 39.09 ( 0.00%) 41.15 ( 5.01%)
1024M1P-xfs Elapsed Time fsmark 84.14 80.41
1024M1P-xfs Elapsed Time simple-wb 210.77 184.78
1024M1P-xfs Elapsed Time mmap-strm 162.00 160.34
1024M1P-xfs Kswapd efficiency fsmark 69% 75%
1024M1P-xfs Kswapd efficiency simple-wb 71% 77%
1024M1P-xfs Kswapd efficiency mmap-strm 43% 44%
1024M-xfs Files/s mean 35.45 ( 0.00%) 37.00 ( 4.19%)
1024M-xfs Elapsed Time fsmark 94.59 91.00
1024M-xfs Elapsed Time simple-wb 229.84 195.08
1024M-xfs Elapsed Time mmap-strm 405.38 440.29
1024M-xfs Kswapd efficiency fsmark 79% 71%
1024M-xfs Kswapd efficiency simple-wb 74% 74%
1024M-xfs Kswapd efficiency mmap-strm 39% 42%
1024M-4X-xfs Files/s mean 32.63 ( 0.00%) 35.05 ( 6.90%)
1024M-4X-xfs Elapsed Time fsmark 103.33 97.74
1024M-4X-xfs Elapsed Time simple-wb 204.48 178.57
1024M-4X-xfs Elapsed Time mmap-strm 528.38 511.88
1024M-4X-xfs Kswapd efficiency fsmark 81% 70%
1024M-4X-xfs Kswapd efficiency simple-wb 73% 72%
1024M-4X-xfs Kswapd efficiency mmap-strm 39% 38%
1024M-16X-xfs Files/s mean 42.65 ( 0.00%) 42.97 ( 0.74%)
1024M-16X-xfs Elapsed Time fsmark 103.11 99.11
1024M-16X-xfs Elapsed Time simple-wb 200.83 178.24
1024M-16X-xfs Elapsed Time mmap-strm 397.35 459.82
1024M-16X-xfs Kswapd efficiency fsmark 84% 69%
1024M-16X-xfs Kswapd efficiency simple-wb 74% 73%
1024M-16X-xfs Kswapd efficiency mmap-strm 39% 40%
All FSMark tests up to 16X had statistically significant improvements.
For the most part, tests are completing faster with the exception of the
streaming writes to a mixture of anonymous and file-backed mappings which
were slower in two cases
In the cases where the mmap-strm tests were slower, there was more
swapping due to dirty pages being skipped. The number of additional pages
swapped is almost identical to the fewer number of pages written from
reclaim. In other words, roughly the same number of pages were reclaimed
but swapping was slower. As the test is a bit unrealistic and stresses
memory heavily, the small shift is acceptable.
4608M1P-xfs Files/s mean 29.75 ( 0.00%) 30.96 ( 3.91%)
4608M1P-xfs Elapsed Time fsmark 512.01 492.15
4608M1P-xfs Elapsed Time simple-wb 618.18 566.24
4608M1P-xfs Elapsed Time mmap-strm 488.05 465.07
4608M1P-xfs Kswapd efficiency fsmark 93% 86%
4608M1P-xfs Kswapd efficiency simple-wb 88% 84%
4608M1P-xfs Kswapd efficiency mmap-strm 46% 45%
4608M-xfs Files/s mean 27.60 ( 0.00%) 28.85 ( 4.33%)
4608M-xfs Elapsed Time fsmark 555.96 532.34
4608M-xfs Elapsed Time simple-wb 659.72 571.85
4608M-xfs Elapsed Time mmap-strm 1082.57 1146.38
4608M-xfs Kswapd efficiency fsmark 89% 91%
4608M-xfs Kswapd efficiency simple-wb 88% 82%
4608M-xfs Kswapd efficiency mmap-strm 48% 46%
4608M-4X-xfs Files/s mean 26.00 ( 0.00%) 27.47 ( 5.35%)
4608M-4X-xfs Elapsed Time fsmark 592.91 564.00
4608M-4X-xfs Elapsed Time simple-wb 616.65 575.07
4608M-4X-xfs Elapsed Time mmap-strm 1773.02 1631.53
4608M-4X-xfs Kswapd efficiency fsmark 90% 94%
4608M-4X-xfs Kswapd efficiency simple-wb 87% 82%
4608M-4X-xfs Kswapd efficiency mmap-strm 43% 43%
4608M-16X-xfs Files/s mean 26.07 ( 0.00%) 26.42 ( 1.32%)
4608M-16X-xfs Elapsed Time fsmark 602.69 585.78
4608M-16X-xfs Elapsed Time simple-wb 606.60 573.81
4608M-16X-xfs Elapsed Time mmap-strm 1549.75 1441.86
4608M-16X-xfs Kswapd efficiency fsmark 98% 98%
4608M-16X-xfs Kswapd efficiency simple-wb 88% 82%
4608M-16X-xfs Kswapd efficiency mmap-strm 44% 42%
Unlike the other tests, the fsmark results are not statistically
significant but the min and max times are both improved and for the most
part, tests completed faster.
There are other indications that this is an improvement as well. For
example, in the vast majority of cases, there were fewer pages scanned by
direct reclaim implying in many cases that stalls due to direct reclaim
are reduced. KSwapd is scanning more due to skipping dirty pages which is
unfortunate but the CPU usage is still acceptable
In an earlier set of tests, I used blktrace and in almost all cases
throughput throughout the entire test was higher. However, I ended up
discarding those results as recording blktrace data was too heavy for my
liking.
On a laptop, I plugged in a USB stick and ran a similar set of tests
using it as backing storage. A desktop environment was running and for
the entire duration of the tests, firefox and gnome terminal were
launching and exiting to vaguely simulate a user.
1024M-xfs Files/s mean 0.41 ( 0.00%) 0.44 ( 6.82%)
1024M-xfs Elapsed Time fsmark 2053.52 1641.03
1024M-xfs Elapsed Time simple-wb 1229.53 768.05
1024M-xfs Elapsed Time mmap-strm 4126.44 4597.03
1024M-xfs Kswapd efficiency fsmark 84% 85%
1024M-xfs Kswapd efficiency simple-wb 92% 81%
1024M-xfs Kswapd efficiency mmap-strm 60% 51%
1024M-xfs Avg wait ms fsmark 5404.53 4473.87
1024M-xfs Avg wait ms simple-wb 2541.35 1453.54
1024M-xfs Avg wait ms mmap-strm 3400.25 3852.53
The mmap-strm results were hurt because firefox launching had a tendency
to push the test out of memory. On the positive side, firefox launched
marginally faster with the patches applied. Time to completion for many
tests was faster but more importantly - the "Avg wait" time as measured by
iostat was far lower implying the system would be more responsive. It was
also the case that "Avg wait ms" on the root filesystem was lower. I
tested it manually and while the system felt slightly more responsive
while copying data to a USB stick, it was marginal enough that it could be
my imagination.
This patch: do not writeback filesystem pages in direct reclaim.
When kswapd is failing to keep zones above the min watermark, a process
will enter direct reclaim in the same manner kswapd does. If a dirty page
is encountered during the scan, this page is written to backing storage
using mapping->writepage.
This causes two problems. First, it can result in very deep call stacks,
particularly if the target storage or filesystem are complex. Some
filesystems ignore write requests from direct reclaim as a result. The
second is that a single-page flush is inefficient in terms of IO. While
there is an expectation that the elevator will merge requests, this does
not always happen. Quoting Christoph Hellwig;
The elevator has a relatively small window it can operate on,
and can never fix up a bad large scale writeback pattern.
This patch prevents direct reclaim writing back filesystem pages by
checking if current is kswapd. Anonymous pages are still written to swap
as there is not the equivalent of a flusher thread for anonymous pages.
If the dirty pages cannot be written back, they are placed back on the LRU
lists. There is now a direct dependency on dirty page balancing to
prevent too many pages in the system being dirtied which would prevent
reclaim making forward progress.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Alex Elder <aelder@sgi.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-11-01 08:07:38 +08:00
|
|
|
"nr_vmscan_write_skip",
|
2011-05-25 08:11:28 +08:00
|
|
|
"nr_writeback_temp",
|
|
|
|
"nr_isolated_anon",
|
|
|
|
"nr_isolated_file",
|
|
|
|
"nr_shmem",
|
|
|
|
"nr_dirtied",
|
|
|
|
"nr_written",
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
"numa_hit",
|
|
|
|
"numa_miss",
|
|
|
|
"numa_foreign",
|
|
|
|
"numa_interleave",
|
|
|
|
"numa_local",
|
|
|
|
"numa_other",
|
|
|
|
#endif
|
|
|
|
"nr_anon_transparent_hugepages",
|
|
|
|
"nr_dirty_threshold",
|
|
|
|
"nr_dirty_background_threshold",
|
|
|
|
|
|
|
|
#ifdef CONFIG_VM_EVENT_COUNTERS
|
|
|
|
"pgpgin",
|
|
|
|
"pgpgout",
|
|
|
|
"pswpin",
|
|
|
|
"pswpout",
|
|
|
|
|
|
|
|
TEXTS_FOR_ZONES("pgalloc")
|
|
|
|
|
|
|
|
"pgfree",
|
|
|
|
"pgactivate",
|
|
|
|
"pgdeactivate",
|
|
|
|
|
|
|
|
"pgfault",
|
|
|
|
"pgmajfault",
|
|
|
|
|
|
|
|
TEXTS_FOR_ZONES("pgrefill")
|
|
|
|
TEXTS_FOR_ZONES("pgsteal")
|
|
|
|
TEXTS_FOR_ZONES("pgscan_kswapd")
|
|
|
|
TEXTS_FOR_ZONES("pgscan_direct")
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
"zone_reclaim_failed",
|
|
|
|
#endif
|
|
|
|
"pginodesteal",
|
|
|
|
"slabs_scanned",
|
|
|
|
"kswapd_steal",
|
|
|
|
"kswapd_inodesteal",
|
|
|
|
"kswapd_low_wmark_hit_quickly",
|
|
|
|
"kswapd_high_wmark_hit_quickly",
|
|
|
|
"kswapd_skip_congestion_wait",
|
|
|
|
"pageoutrun",
|
|
|
|
"allocstall",
|
|
|
|
|
|
|
|
"pgrotated",
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPACTION
|
|
|
|
"compact_blocks_moved",
|
|
|
|
"compact_pages_moved",
|
|
|
|
"compact_pagemigrate_failed",
|
|
|
|
"compact_stall",
|
|
|
|
"compact_fail",
|
|
|
|
"compact_success",
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
"htlb_buddy_alloc_success",
|
|
|
|
"htlb_buddy_alloc_fail",
|
|
|
|
#endif
|
|
|
|
"unevictable_pgs_culled",
|
|
|
|
"unevictable_pgs_scanned",
|
|
|
|
"unevictable_pgs_rescued",
|
|
|
|
"unevictable_pgs_mlocked",
|
|
|
|
"unevictable_pgs_munlocked",
|
|
|
|
"unevictable_pgs_cleared",
|
|
|
|
"unevictable_pgs_stranded",
|
|
|
|
"unevictable_pgs_mlockfreed",
|
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
"thp_fault_alloc",
|
|
|
|
"thp_fault_fallback",
|
|
|
|
"thp_collapse_alloc",
|
|
|
|
"thp_collapse_alloc_failed",
|
|
|
|
"thp_split",
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* CONFIG_VM_EVENT_COUNTERS */
|
|
|
|
};
|
2011-09-15 07:21:05 +08:00
|
|
|
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
|
2011-05-25 08:11:28 +08:00
|
|
|
|
|
|
|
|
2010-05-25 05:32:25 +08:00
|
|
|
#ifdef CONFIG_PROC_FS
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
/*
 * Per-zone callback handed to walk_zones_in_node() by frag_show(): prints
 * the node id and zone name, then zone->free_area[order].nr_free for every
 * order below MAX_ORDER, and terminates the line with a newline.
 */
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
|
|
|
|
struct zone *zone)
|
|
|
|
{
|
|
|
|
int order;
|
|
|
|
|
|
|
|
|
|
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
|
|
|
|
for (order = 0; order < MAX_ORDER; ++order)
|
|
|
|
seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This walks the free areas for each zone.
* Installed as the .show handler of fragmentation_op: arg is cast to the
* pg_data_t to report on and walk_zones_in_node() is asked to run
* frag_show_print() over it. Always returns 0.
|
|
|
|
*/
|
|
|
|
static int frag_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
walk_zones_in_node(m, pgdat, frag_show_print);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Per-zone callback handed to walk_zones_in_node() by
 * pagetypeinfo_showfree(). For every migrate type index below
 * MIGRATE_TYPES it prints one row: node id, zone name and
 * migratetype_names[mtype], followed, for each order below MAX_ORDER, by
 * the number of entries on zone->free_area[order].free_list[mtype].
 */
static void pagetypeinfo_showfree_print(struct seq_file *m,
|
|
|
|
pg_data_t *pgdat, struct zone *zone)
|
|
|
|
{
|
|
|
|
int order, mtype;
|
|
|
|
|
|
|
|
|
|
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
|
|
|
|
seq_printf(m, "Node %4d, zone %8s, type %12s ",
|
|
|
|
pgdat->node_id,
|
|
|
|
zone->name,
|
|
|
|
migratetype_names[mtype]);
|
|
|
|
for (order = 0; order < MAX_ORDER; ++order) {
|
|
|
|
unsigned long freecount = 0;
|
|
|
|
struct free_area *area;
|
|
|
|
struct list_head *curr;
|
|
|
|
|
|
|
|
|
|
area = &(zone->free_area[order]);
|
|
|
|
|
|
|
|
|
|
/* The count is obtained by walking the list entry by entry. */
list_for_each(curr, &area->free_list[mtype])
|
|
|
|
freecount++;
|
|
|
|
seq_printf(m, "%6lu ", freecount);
|
|
|
|
}
|
2006-06-30 16:55:32 +08:00
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Print out the free pages at each order for each migratetype: a header
 * row listing every order below MAX_ORDER, then the rows emitted by
 * pagetypeinfo_showfree_print() via walk_zones_in_node(). arg is cast to
 * the pg_data_t to report on. Always returns 0.
 */
|
|
|
|
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
int order;
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
|
|
|
|
|
|
/* Print header */
|
|
|
|
seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
|
|
|
|
for (order = 0; order < MAX_ORDER; ++order)
|
|
|
|
seq_printf(m, "%6d ", order);
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
|
|
|
|
|
|
walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Per-zone callback handed to walk_zones_in_node() by
 * pagetypeinfo_showblockcount(). Steps through the pfn range
 * [zone_start_pfn, zone_start_pfn + spanned_pages) in strides of
 * pageblock_nr_pages, reads the migrate type of the page at each step and
 * tallies it in count[]. Steps whose pfn fails pfn_valid() or
 * memmap_valid_within() are skipped and therefore not counted. Finally
 * prints the node id, the zone name and one count per migrate type.
 */
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
|
|
|
|
pg_data_t *pgdat, struct zone *zone)
|
|
|
|
{
|
|
|
|
int mtype;
|
|
|
|
unsigned long pfn;
|
|
|
|
unsigned long start_pfn = zone->zone_start_pfn;
|
|
|
|
unsigned long end_pfn = start_pfn + zone->spanned_pages;
|
|
|
|
unsigned long count[MIGRATE_TYPES] = { 0, };
|
|
|
|
|
|
|
|
|
|
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
|
|
if (!pfn_valid(pfn))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
page = pfn_to_page(pfn);
|
2009-05-14 00:34:48 +08:00
|
|
|
|
|
|
|
|
/* Watch for unexpected holes punched in the memmap */
|
|
|
|
if (!memmap_valid_within(pfn, page, zone))
|
2008-08-14 18:10:14 +08:00
|
|
|
continue;
|
2009-05-14 00:34:48 +08:00
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
mtype = get_pageblock_migratetype(page);
|
|
|
|
|
2008-08-14 18:10:14 +08:00
|
|
|
/* Guard the count[] index: the array has MIGRATE_TYPES entries. */
if (mtype < MIGRATE_TYPES)
|
|
|
|
count[mtype]++;
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Print counts */
|
|
|
|
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
|
|
|
|
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
|
|
|
|
seq_printf(m, "%12lu ", count[mtype]);
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Print out the number of pageblocks for each migratetype: a header row
 * built from migratetype_names[], then the per-zone counts emitted by
 * pagetypeinfo_showblockcount_print() via walk_zones_in_node(). arg is
 * cast to the pg_data_t to report on. Always returns 0.
 */
|
|
|
|
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
int mtype;
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
|
|
|
|
|
|
seq_printf(m, "\n%-23s", "Number of blocks type ");
|
|
|
|
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
|
|
|
|
seq_printf(m, "%12s ", migratetype_names[mtype]);
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This prints out statistics in relation to grouping pages by mobility.
|
|
|
|
* It is expensive to collect so do not constantly read the file.
* Installed as the .show handler of pagetypeinfo_op; arg is cast to the
* pg_data_t to report on. Nothing is printed when node_state() reports
* that the node lacks N_HIGH_MEMORY; otherwise the page block order and
* pages-per-block header is printed, followed by the output of
* pagetypeinfo_showfree() and pagetypeinfo_showblockcount().
* Always returns 0.
|
|
|
|
*/
|
|
|
|
static int pagetypeinfo_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
|
2008-04-30 15:52:13 +08:00
|
|
|
/* check memoryless node */
|
|
|
|
if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
|
|
|
|
return 0;
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
seq_printf(m, "Page block order: %d\n", pageblock_order);
|
|
|
|
seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
pagetypeinfo_showfree(m, pgdat);
|
|
|
|
pagetypeinfo_showblockcount(m, pgdat);
|
|
|
|
|
2006-06-30 16:55:32 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-06 08:13:52 +08:00
|
|
|
/*
 * seq_file operations installed by fragmentation_open(). The
 * start/next/stop handlers are the same frag_* helpers used by
 * pagetypeinfo_op; only the .show handler (frag_show) differs.
 */
static const struct seq_operations fragmentation_op = {
|
2006-06-30 16:55:32 +08:00
|
|
|
.start = frag_start,
|
|
|
|
.next = frag_next,
|
|
|
|
.stop = frag_stop,
|
|
|
|
.show = frag_show,
|
|
|
|
};
|
|
|
|
|
2008-10-06 08:13:52 +08:00
|
|
|
/*
 * open handler of fragmentation_file_operations: binds the file to
 * fragmentation_op via seq_open() and returns seq_open()'s result.
 * The inode argument is not used.
 */
static int fragmentation_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &fragmentation_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * File operations whose open handler sets up the fragmentation_op
 * iterator; read, llseek and release are delegated to the seq_read,
 * seq_lseek and seq_release helpers.
 * NOTE(review): presumably backs /proc/buddyinfo -- the registration is
 * not visible in this part of the file; confirm.
 */
static const struct file_operations fragmentation_file_operations = {
|
|
|
|
.open = fragmentation_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2008-10-06 08:15:36 +08:00
|
|
|
/*
 * seq_file operations installed by pagetypeinfo_open(). The
 * start/next/stop handlers are the same frag_* helpers used by
 * fragmentation_op; only the .show handler (pagetypeinfo_show) differs.
 */
static const struct seq_operations pagetypeinfo_op = {
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
.start = frag_start,
|
|
|
|
.next = frag_next,
|
|
|
|
.stop = frag_stop,
|
|
|
|
.show = pagetypeinfo_show,
|
|
|
|
};
|
|
|
|
|
2008-10-06 08:15:36 +08:00
|
|
|
/*
 * open handler of pagetypeinfo_file_ops: binds the file to
 * pagetypeinfo_op via seq_open() and returns seq_open()'s result.
 * The inode argument is not used.
 */
static int pagetypeinfo_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &pagetypeinfo_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * File operations whose open handler sets up the pagetypeinfo_op
 * iterator; read, llseek and release are delegated to the seq_read,
 * seq_lseek and seq_release helpers.
 * NOTE(review): presumably backs /proc/pagetypeinfo -- the registration
 * is not visible in this part of the file; confirm.
 */
static const struct file_operations pagetypeinfo_file_ops = {
|
|
|
|
.open = pagetypeinfo_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
|
|
|
|
struct zone *zone)
|
2006-06-30 16:55:32 +08:00
|
|
|
{
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
int i;
|
|
|
|
seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
|
|
|
|
seq_printf(m,
|
|
|
|
"\n pages free %lu"
|
|
|
|
"\n min %lu"
|
|
|
|
"\n low %lu"
|
|
|
|
"\n high %lu"
|
2009-06-17 06:32:30 +08:00
|
|
|
"\n scanned %lu"
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
"\n spanned %lu"
|
|
|
|
"\n present %lu",
|
mm: page allocator: adjust the per-cpu counter threshold when memory is low
Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when a per-cpu delta is above a
threshold. On large CPU systems, the difference between the estimate and
real value of NR_FREE_PAGES can be very high. The system can get into a
case where pages are allocated far below the min watermark potentially
causing livelock issues. The commit solved the problem by taking a better
reading of NR_FREE_PAGES when memory was low.
Unfortunately, as reported by Shaohua Li this accurate reading can consume a
large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. For large machines
where counter drift might be unsafe and while kswapd is awake, the per-cpu
thresholds for the target pgdat are reduced to limit the level of drift to
what should be a safe level. This incurs a performance penalty in heavy
memory pressure by a factor that depends on the workload and the machine
but the machine should function correctly without accidentally exhausting
all memory on a node. There is an additional cost when kswapd wakes and
sleeps but the event is not expected to be frequent - in Shaohua's test
case, there was one recorded sleep and wake event at least.
To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely() and when deciding if a zone is really
balanced or not in balance_pgdat(). We are still using an expensive
function but limiting how often it is called.
When the test case is reproduced, the time spent in the watermark
functions is reduced. The following report is on the percentage of time
cumulatively spent in the functions zone_nr_free_pages(),
zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot(), zone_page_state().
vanilla 11.6615%
disable-threshold 0.2584%
David said:
: We had to pull aa454840 "mm: page allocator: calculate a better estimate
: of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36
: internally because tests showed that it would cause the machine to stall
: as the result of heavy kswapd activity. I merged it back with this fix as
: it is pending in the -mm tree and it solves the issue we were seeing, so I
: definitely think this should be pushed to -stable (and I would seriously
: consider it for 2.6.37 inclusion even at this late date).
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Tested-by: Nicolas Bareil <nico@chdir.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: <stable@kernel.org> [2.6.37.1, 2.6.36.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-01-14 07:45:41 +08:00
|
|
|
zone_page_state(zone, NR_FREE_PAGES),
|
2009-06-17 06:32:12 +08:00
|
|
|
min_wmark_pages(zone),
|
|
|
|
low_wmark_pages(zone),
|
|
|
|
high_wmark_pages(zone),
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
zone->pages_scanned,
|
|
|
|
zone->spanned_pages,
|
|
|
|
zone->present_pages);
|
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
|
|
|
|
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
|
|
|
|
zone_page_state(zone, i));
|
|
|
|
|
|
|
|
seq_printf(m,
|
|
|
|
"\n protection: (%lu",
|
|
|
|
zone->lowmem_reserve[0]);
|
|
|
|
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
|
|
|
|
seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
|
|
|
|
seq_printf(m,
|
|
|
|
")"
|
|
|
|
"\n pagesets");
|
|
|
|
for_each_online_cpu(i) {
|
|
|
|
struct per_cpu_pageset *pageset;
|
|
|
|
|
2010-01-05 14:34:51 +08:00
|
|
|
pageset = per_cpu_ptr(zone->pageset, i);
|
2008-02-05 14:29:19 +08:00
|
|
|
seq_printf(m,
|
|
|
|
"\n cpu: %i"
|
|
|
|
"\n count: %i"
|
|
|
|
"\n high: %i"
|
|
|
|
"\n batch: %i",
|
|
|
|
i,
|
|
|
|
pageset->pcp.count,
|
|
|
|
pageset->pcp.high,
|
|
|
|
pageset->pcp.batch);
|
2006-09-01 12:27:35 +08:00
|
|
|
#ifdef CONFIG_SMP
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
seq_printf(m, "\n vm stats threshold: %d",
|
|
|
|
pageset->stat_threshold);
|
2006-09-01 12:27:35 +08:00
|
|
|
#endif
|
2006-06-30 16:55:32 +08:00
|
|
|
}
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
seq_printf(m,
|
|
|
|
"\n all_unreclaimable: %u"
|
2008-10-19 11:26:34 +08:00
|
|
|
"\n start_pfn: %lu"
|
|
|
|
"\n inactive_ratio: %u",
|
2010-03-06 05:41:55 +08:00
|
|
|
zone->all_unreclaimable,
|
2008-10-19 11:26:34 +08:00
|
|
|
zone->zone_start_pfn,
|
|
|
|
zone->inactive_ratio);
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Output information about zones in @pgdat.
|
|
|
|
*/
|
|
|
|
static int zoneinfo_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
walk_zones_in_node(m, pgdat, zoneinfo_show_print);
|
2006-06-30 16:55:32 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-06 08:19:42 +08:00
|
|
|
static const struct seq_operations zoneinfo_op = {
|
2006-06-30 16:55:32 +08:00
|
|
|
.start = frag_start, /* iterate over all zones. The same as in
|
|
|
|
* fragmentation. */
|
|
|
|
.next = frag_next,
|
|
|
|
.stop = frag_stop,
|
|
|
|
.show = zoneinfo_show,
|
|
|
|
};
|
|
|
|
|
2008-10-06 08:19:42 +08:00
|
|
|
static int zoneinfo_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &zoneinfo_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations proc_zoneinfo_file_operations = {
|
|
|
|
.open = zoneinfo_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2010-10-27 05:21:36 +08:00
|
|
|
/*
 * Indices of the writeback threshold slots that vmstat_start() fills in
 * right after the NR_VM_ZONE_STAT_ITEMS zone counters.
 */
enum writeback_stat_item {
	NR_DIRTY_THRESHOLD = 0,
	NR_DIRTY_BG_THRESHOLD,
	NR_VM_WRITEBACK_STAT_ITEMS,	/* number of slots above; keep last */
};
|
|
|
|
|
2006-06-30 16:55:32 +08:00
|
|
|
/*
 * seq_file ->start for /proc/vmstat.
 *
 * Allocates a snapshot buffer and fills it, in this order, with:
 *   1. the NR_VM_ZONE_STAT_ITEMS global zone counters,
 *   2. the NR_VM_WRITEBACK_STAT_ITEMS dirty thresholds,
 *   3. (CONFIG_VM_EVENT_COUNTERS only) the VM event counters.
 * The buffer is parked in m->private and released by vmstat_stop().
 *
 * Returns a pointer to entry *pos of the snapshot, NULL when *pos is past
 * the end of vmstat_text[], or ERR_PTR(-ENOMEM) if the allocation failed.
 */
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *snapshot, *cursor;
	int item, bytes;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	bytes = (NR_VM_ZONE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS) *
		sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
	bytes += sizeof(struct vm_event_state);
#endif

	snapshot = kmalloc(bytes, GFP_KERNEL);
	/* Stored even when NULL so that vmstat_stop() sees a sane value. */
	m->private = snapshot;
	if (!snapshot)
		return ERR_PTR(-ENOMEM);

	cursor = snapshot;
	for (item = 0; item < NR_VM_ZONE_STAT_ITEMS; item++)
		cursor[item] = global_page_state(item);
	cursor += NR_VM_ZONE_STAT_ITEMS;

	global_dirty_limits(&cursor[NR_DIRTY_BG_THRESHOLD],
			    &cursor[NR_DIRTY_THRESHOLD]);
	cursor += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(cursor);
	cursor[PGPGIN] /= 2;		/* sectors -> kbytes */
	cursor[PGPGOUT] /= 2;
#endif

	return snapshot + *pos;
}
|
|
|
|
|
|
|
|
/*
 * seq_file ->next for /proc/vmstat: advance *pos and return the matching
 * entry of the snapshot kept in m->private, or NULL once *pos has run past
 * the last name in vmstat_text[].
 */
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	unsigned long *snapshot = m->private;

	++*pos;

	return *pos < ARRAY_SIZE(vmstat_text) ? snapshot + *pos : NULL;
}
|
|
|
|
|
|
|
|
static int vmstat_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
unsigned long *l = arg;
|
|
|
|
unsigned long off = l - (unsigned long *)m->private;
|
|
|
|
|
|
|
|
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vmstat_stop(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
kfree(m->private);
|
|
|
|
m->private = NULL;
|
|
|
|
}
|
|
|
|
|
2008-10-06 08:17:48 +08:00
|
|
|
static const struct seq_operations vmstat_op = {
|
2006-06-30 16:55:32 +08:00
|
|
|
.start = vmstat_start,
|
|
|
|
.next = vmstat_next,
|
|
|
|
.stop = vmstat_stop,
|
|
|
|
.show = vmstat_show,
|
|
|
|
};
|
|
|
|
|
2008-10-06 08:17:48 +08:00
|
|
|
static int vmstat_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &vmstat_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations proc_vmstat_file_operations = {
|
|
|
|
.open = vmstat_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
2006-06-30 16:55:32 +08:00
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
2006-09-01 12:27:35 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2007-05-09 17:35:12 +08:00
|
|
|
/*
 * Per-CPU delayed work item.  start_cpu_timer() binds it to vmstat_update()
 * and queues it; vmstat_update() re-queues it on every run.
 */
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
|
2007-05-09 17:35:13 +08:00
|
|
|
/*
 * Delay between successive vmstat_update() runs (passed through
 * round_jiffies_relative() when re-queueing).  Initialised to HZ.
 */
int sysctl_stat_interval __read_mostly = HZ;
|
2007-05-09 17:35:12 +08:00
|
|
|
|
|
|
|
static void vmstat_update(struct work_struct *w)
|
|
|
|
{
|
|
|
|
refresh_cpu_vm_stats(smp_processor_id());
|
2007-05-09 17:35:13 +08:00
|
|
|
schedule_delayed_work(&__get_cpu_var(vmstat_work),
|
2009-04-03 07:56:39 +08:00
|
|
|
round_jiffies_relative(sysctl_stat_interval));
|
2007-05-09 17:35:12 +08:00
|
|
|
}
|
|
|
|
|
2007-11-15 09:00:12 +08:00
|
|
|
/*
 * (Re)initialise @cpu's vmstat_work as deferrable delayed work running
 * vmstat_update() and queue it on that CPU, with an initial delay of
 * __round_jiffies_relative(HZ, cpu).
 */
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *dwork = &per_cpu(vmstat_work, cpu);
	unsigned long first_delay;

	INIT_DELAYED_WORK_DEFERRABLE(dwork, vmstat_update);

	first_delay = __round_jiffies_relative(HZ, cpu);
	schedule_delayed_work_on(cpu, dwork, first_delay);
}
|
|
|
|
|
2006-09-01 12:27:35 +08:00
|
|
|
/*
|
|
|
|
* Use the cpu notifier to insure that the thresholds are recalculated
|
|
|
|
* when necessary.
|
|
|
|
*/
|
|
|
|
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
|
|
|
|
unsigned long action,
|
|
|
|
void *hcpu)
|
|
|
|
{
|
2007-05-09 17:35:12 +08:00
|
|
|
long cpu = (long)hcpu;
|
|
|
|
|
2006-09-01 12:27:35 +08:00
|
|
|
switch (action) {
|
2007-05-09 17:35:12 +08:00
|
|
|
case CPU_ONLINE:
|
|
|
|
case CPU_ONLINE_FROZEN:
|
2010-09-10 07:38:14 +08:00
|
|
|
refresh_zone_stat_thresholds();
|
2007-05-09 17:35:12 +08:00
|
|
|
start_cpu_timer(cpu);
|
2010-01-05 14:34:51 +08:00
|
|
|
node_set_state(cpu_to_node(cpu), N_CPU);
|
2007-05-09 17:35:12 +08:00
|
|
|
break;
|
|
|
|
case CPU_DOWN_PREPARE:
|
|
|
|
case CPU_DOWN_PREPARE_FROZEN:
|
2010-12-14 23:21:17 +08:00
|
|
|
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
|
2007-05-09 17:35:12 +08:00
|
|
|
per_cpu(vmstat_work, cpu).work.func = NULL;
|
|
|
|
break;
|
|
|
|
case CPU_DOWN_FAILED:
|
|
|
|
case CPU_DOWN_FAILED_FROZEN:
|
|
|
|
start_cpu_timer(cpu);
|
|
|
|
break;
|
2006-12-07 12:33:08 +08:00
|
|
|
case CPU_DEAD:
|
2007-05-09 17:35:10 +08:00
|
|
|
case CPU_DEAD_FROZEN:
|
2006-12-07 12:33:08 +08:00
|
|
|
refresh_zone_stat_thresholds();
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
2006-09-01 12:27:35 +08:00
|
|
|
}
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Hotplug notifier block registered by setup_vmstat(). */
static struct notifier_block __cpuinitdata vmstat_notifier = {
	.notifier_call	= vmstat_cpuup_callback,
	.next		= NULL,
	.priority	= 0,
};
|
2008-10-06 08:13:52 +08:00
|
|
|
#endif
|
2006-09-01 12:27:35 +08:00
|
|
|
|
2007-10-16 16:26:27 +08:00
|
|
|
static int __init setup_vmstat(void)
|
2006-09-01 12:27:35 +08:00
|
|
|
{
|
2008-10-06 08:13:52 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2007-05-09 17:35:12 +08:00
|
|
|
int cpu;
|
|
|
|
|
2006-09-01 12:27:35 +08:00
|
|
|
register_cpu_notifier(&vmstat_notifier);
|
2007-05-09 17:35:12 +08:00
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
|
|
|
start_cpu_timer(cpu);
|
2008-10-06 08:13:52 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
|
2008-10-06 08:15:36 +08:00
|
|
|
proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
|
2008-10-06 08:17:48 +08:00
|
|
|
proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
|
2008-10-06 08:19:42 +08:00
|
|
|
proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
|
2008-10-06 08:13:52 +08:00
|
|
|
#endif
|
2006-09-01 12:27:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
module_init(setup_vmstat)
|
2010-05-25 05:32:25 +08:00
|
|
|
|
|
|
|
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
|
|
|
|
#include <linux/debugfs.h>
|
|
|
|
|
|
|
|
static struct dentry *extfrag_debug_root;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return an index indicating how much of the available free memory is
|
|
|
|
* unusable for an allocation of the requested size.
|
|
|
|
*/
|
|
|
|
static int unusable_free_index(unsigned int order,
|
|
|
|
struct contig_page_info *info)
|
|
|
|
{
|
|
|
|
/* No free memory is interpreted as all free memory is unusable */
|
|
|
|
if (info->free_pages == 0)
|
|
|
|
return 1000;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Index should be a value between 0 and 1. Return a value to 3
|
|
|
|
* decimal places.
|
|
|
|
*
|
|
|
|
* 0 => no fragmentation
|
|
|
|
* 1 => high fragmentation
|
|
|
|
*/
|
|
|
|
return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unusable_show_print(struct seq_file *m,
|
|
|
|
pg_data_t *pgdat, struct zone *zone)
|
|
|
|
{
|
|
|
|
unsigned int order;
|
|
|
|
int index;
|
|
|
|
struct contig_page_info info;
|
|
|
|
|
|
|
|
seq_printf(m, "Node %d, zone %8s ",
|
|
|
|
pgdat->node_id,
|
|
|
|
zone->name);
|
|
|
|
for (order = 0; order < MAX_ORDER; ++order) {
|
|
|
|
fill_contig_page_info(zone, order, &info);
|
|
|
|
index = unusable_free_index(order, &info);
|
|
|
|
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Display unusable free space index
|
|
|
|
*
|
|
|
|
* The unusable free space index measures how much of the available free
|
|
|
|
* memory cannot be used to satisfy an allocation of a given size and is a
|
|
|
|
* value between 0 and 1. The higher the value, the more of free memory is
|
|
|
|
* unusable and by implication, the worse the external fragmentation is. This
|
|
|
|
* can be expressed as a percentage by multiplying by 100.
|
|
|
|
*/
|
|
|
|
static int unusable_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
|
|
|
|
/* check memoryless node */
|
|
|
|
if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
walk_zones_in_node(m, pgdat, unusable_show_print);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct seq_operations unusable_op = {
|
|
|
|
.start = frag_start,
|
|
|
|
.next = frag_next,
|
|
|
|
.stop = frag_stop,
|
|
|
|
.show = unusable_show,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int unusable_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &unusable_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations unusable_file_ops = {
|
|
|
|
.open = unusable_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2010-05-25 05:32:26 +08:00
|
|
|
static void extfrag_show_print(struct seq_file *m,
|
|
|
|
pg_data_t *pgdat, struct zone *zone)
|
|
|
|
{
|
|
|
|
unsigned int order;
|
|
|
|
int index;
|
|
|
|
|
|
|
|
/* Alloc on stack as interrupts are disabled for zone walk */
|
|
|
|
struct contig_page_info info;
|
|
|
|
|
|
|
|
seq_printf(m, "Node %d, zone %8s ",
|
|
|
|
pgdat->node_id,
|
|
|
|
zone->name);
|
|
|
|
for (order = 0; order < MAX_ORDER; ++order) {
|
|
|
|
fill_contig_page_info(zone, order, &info);
|
2010-05-25 05:32:30 +08:00
|
|
|
index = __fragmentation_index(order, &info);
|
2010-05-25 05:32:26 +08:00
|
|
|
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Display fragmentation index for orders that allocations would fail for
|
|
|
|
*/
|
|
|
|
static int extfrag_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = (pg_data_t *)arg;
|
|
|
|
|
|
|
|
walk_zones_in_node(m, pgdat, extfrag_show_print);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct seq_operations extfrag_op = {
|
|
|
|
.start = frag_start,
|
|
|
|
.next = frag_next,
|
|
|
|
.stop = frag_stop,
|
|
|
|
.show = extfrag_show,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int extfrag_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &extfrag_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations extfrag_file_ops = {
|
|
|
|
.open = extfrag_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2010-05-25 05:32:25 +08:00
|
|
|
/*
 * Create the "extfrag" debugfs directory together with its two read-only
 * files, "unusable_index" and "extfrag_index".
 *
 * Returns 0 on success or -ENOMEM if any of the three entries could not be
 * created.  On a partial failure everything created so far is removed
 * again, so no half-populated directory is left behind in debugfs.
 */
static int __init extfrag_debug_init(void)
{
	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
	if (!extfrag_debug_root)
		return -ENOMEM;

	if (!debugfs_create_file("unusable_index", 0444,
			extfrag_debug_root, NULL, &unusable_file_ops))
		goto fail;

	if (!debugfs_create_file("extfrag_index", 0444,
			extfrag_debug_root, NULL, &extfrag_file_ops))
		goto fail;

	return 0;

fail:
	/* Tear down the directory and whatever was already created in it. */
	debugfs_remove_recursive(extfrag_debug_root);
	extfrag_debug_root = NULL;
	return -ENOMEM;
}

module_init(extfrag_debug_init);
|
|
|
|
#endif
|