#ifndef _LINUX_COMPACTION_H
#define _LINUX_COMPACTION_H

/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NOT_SUITABLE_ZONE,
	/*
	 * compaction didn't start as it was not possible or direct reclaim
	 * was more suitable
	 */
	COMPACT_SKIPPED,
	/* compaction didn't start as it was deferred due to past failures */
	COMPACT_DEFERRED,

	/* compaction not active last round */
	COMPACT_INACTIVE = COMPACT_DEFERRED,

	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NO_SUITABLE_PAGE,
	/* compaction should continue to another pageblock */
	COMPACT_CONTINUE,

	/*
	 * The full zone was compacted and scanned but it was not possible
	 * to compact suitable pages.
	 */
	COMPACT_COMPLETE,
	/*
	 * direct compaction has scanned part of the zone but it was not
	 * possible to compact suitable pages.
	 */
	COMPACT_PARTIAL_SKIPPED,

	/* compaction terminated prematurely due to lock contention */
	COMPACT_CONTENDED,

	/*
	 * direct compaction partially compacted a zone and there might be
	 * suitable pages
	 */
	COMPACT_PARTIAL,
};

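/*
 * Illustrative sketch only (not part of this header): one way a direct
 * compaction caller could act on the result values documented above.
 * gfp_mask, order, alloc_flags, ac and contended are assumed to come from
 * the surrounding allocation path; the goto labels are hypothetical.
 *
 *	switch (try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 *				     MIGRATE_ASYNC, &contended)) {
 *	case COMPACT_PARTIAL:		// suitable pages may now exist
 *		goto retry_allocation;
 *	case COMPACT_SKIPPED:		// reclaim (or back off) before retrying
 *	case COMPACT_DEFERRED:
 *		goto try_reclaim;
 *	default:			// COMPACT_COMPLETE etc.: no progress made
 *		break;
 *	}
 */
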
/* Used to signal whether compaction detected need_resched() or lock contention */
/* No contention detected */
#define COMPACT_CONTENDED_NONE	0
/* Either need_resched() was true or fatal signal pending */
#define COMPACT_CONTENDED_SCHED	1
/* Zone lock or lru_lock was contended in async compaction */
#define COMPACT_CONTENDED_LOCK	2

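/*
 * Illustrative sketch only: the COMPACT_CONTENDED_* values are what the
 * "contended" output argument of try_to_compact_pages() (declared below)
 * is expected to carry.  A hypothetical caller-side check:
 *
 *	int contended = COMPACT_CONTENDED_NONE;
 *
 *	try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 *			     MIGRATE_ASYNC, &contended);
 *	if (contended == COMPACT_CONTENDED_SCHED)
 *		;	// aborted to reschedule/exit, don't insist now
 *	else if (contended == COMPACT_CONTENDED_LOCK)
 *		;	// zone/lru lock contention, retrying later may help
 */
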
struct alloc_context; /* in mm/internal.h */

#ifdef CONFIG_COMPACTION
extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_compact_unevictable_allowed;

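/*
 * The sysctl handlers above are meant to be referenced from a ctl_table
 * (normally kernel/sysctl.c).  A minimal, illustrative entry; the mode and
 * placement are assumptions rather than a copy of the real table:
 *
 *	{
 *		.procname	= "compact_memory",
 *		.data		= &sysctl_compact_memory,
 *		.maxlen		= sizeof(int),
 *		.mode		= 0200,
 *		.proc_handler	= sysctl_compaction_handler,
 *	},
 */
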
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
			unsigned int order,
			unsigned int alloc_flags, const struct alloc_context *ac,
			enum migrate_mode mode, int *contended);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
			unsigned int alloc_flags, int classzone_idx);

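/*
 * Illustrative sketch only: compaction_suitable() can be used as a gate
 * before compacting a zone.  zone/order/alloc_flags/classzone_idx are
 * assumed to come from the caller.
 *
 *	if (compaction_suitable(zone, order, alloc_flags, classzone_idx)
 *						== COMPACT_SKIPPED)
 *		return;	// zone lacks free pages to migrate into; reclaim first
 */
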
extern void defer_compaction(struct zone *zone, int order);
extern bool compaction_deferred(struct zone *zone, int order);
extern void compaction_defer_reset(struct zone *zone, int order,
				bool alloc_success);
extern bool compaction_restarting(struct zone *zone, int order);

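/*
 * A hedged sketch (not lifted from mm/compaction.c) of how the deferral
 * helpers above fit together; "allocation_succeeded" is a stand-in for the
 * caller's own success check:
 *
 *	if (compaction_deferred(zone, order))
 *		return COMPACT_DEFERRED;	// back off after past failures
 *
 *	... run compact_zone() ...
 *	if (allocation_succeeded)
 *		compaction_defer_reset(zone, order, true);
 *	else
 *		defer_compaction(zone, order);	// defer further attempts
 */
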
extern int kcompactd_run(int nid);
extern void kcompactd_stop(int nid);
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);

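/*
 * kcompactd is intended as a per-node background compaction thread.  A
 * hedged sketch of starting/stopping it; the loop below is illustrative,
 * not the actual init/hotplug call sites:
 *
 *	for_each_online_node(nid)
 *		kcompactd_run(nid);
 *	...
 *	kcompactd_stop(nid);	// e.g. when a node goes offline
 */
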
#else
static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
			unsigned int order, int alloc_flags,
			const struct alloc_context *ac,
			enum migrate_mode mode, int *contended)
{
	return COMPACT_CONTINUE;
}

static inline void compact_pgdat(pg_data_t *pgdat, int order)
{
}

static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}

static inline enum compact_result compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	return COMPACT_SKIPPED;
}

static inline void defer_compaction(struct zone *zone, int order)
{
}

static inline bool compaction_deferred(struct zone *zone, int order)
{
	return true;
}

static inline int kcompactd_run(int nid)
{
	return 0;
}
static inline void kcompactd_stop(int nid)
{
}

static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
{
}

#endif /* CONFIG_COMPACTION */

#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);

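/*
 * Illustrative sketch only: these hooks are meant to be called when a NUMA
 * node device is registered/unregistered so its compaction sysfs attribute
 * follows it; the surrounding driver code here is hypothetical.
 *
 *	if (compaction_register_node(node))
 *		pr_warn("compaction: sysfs registration failed\n");
 *	...
 *	compaction_unregister_node(node);
 */
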
#else

static inline int compaction_register_node(struct node *node)
{
	return 0;
}

static inline void compaction_unregister_node(struct node *node)
{
}
#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */

#endif /* _LINUX_COMPACTION_H */