mm/swapfile: use percpu_ref to serialize against concurrent swapoff

Patch series "close various race windows for swap", v6.

When I was investigating the swap code, I found some possible race
windows.  This series aims to fix all these races.  But using current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really
long time.  And to reduce the performance overhead on the hot path as much
as possible, it appears we can use the percpu_ref to close this race
window (as suggested by Huang, Ying).  Patch 1 adds percpu_ref support
for swap and most of the remaining patches try to use this to close
various race windows.  More details can be found in the respective
changelogs.

This patch (of 4):

Using the current get/put_swap_device() to guard against concurrent swapoff
for some swap ops, e.g.  swap_readpage(), looks terrible because they
might take a really long time.  This patch adds percpu_ref support to
serialize against concurrent swapoff (as suggested by Huang, Ying).  Also
we remove the SWP_VALID flag because it was only used together with the
RCU solution.

Link: https://lkml.kernel.org/r/20210426123316.806267-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20210426123316.806267-2-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Miaohe Lin 2021-06-28 19:36:46 -07:00 committed by Linus Torvalds
parent e17eae2b83
commit 63d8620ecf
2 changed files with 52 additions and 32 deletions

View File

@ -177,7 +177,6 @@ enum {
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
SWP_VALID = (1 << 13), /* swap is valid to be operated on? */
/* add others here before... */ /* add others here before... */
SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */
}; };
@ -240,6 +239,7 @@ struct swap_cluster_list {
* The in-memory structure used to track swap areas. * The in-memory structure used to track swap areas.
*/ */
struct swap_info_struct { struct swap_info_struct {
struct percpu_ref users; /* indicate and keep swap device valid. */
unsigned long flags; /* SWP_USED etc: see above */ unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */ signed short prio; /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */ struct plist_node list; /* entry in swap_active_head */
@ -260,6 +260,7 @@ struct swap_info_struct {
struct block_device *bdev; /* swap device or bdev of swap file */ struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */ struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */
struct completion comp; /* seldom referenced */
#ifdef CONFIG_FRONTSWAP #ifdef CONFIG_FRONTSWAP
unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
atomic_t frontswap_pages; /* frontswap pages in-use counter */ atomic_t frontswap_pages; /* frontswap pages in-use counter */
@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
static inline void put_swap_device(struct swap_info_struct *si) static inline void put_swap_device(struct swap_info_struct *si)
{ {
rcu_read_unlock(); percpu_ref_put(&si->users);
} }
#else /* CONFIG_SWAP */ #else /* CONFIG_SWAP */

View File

@ -39,6 +39,7 @@
#include <linux/export.h> #include <linux/export.h>
#include <linux/swap_slots.h> #include <linux/swap_slots.h>
#include <linux/sort.h> #include <linux/sort.h>
#include <linux/completion.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <linux/swapops.h> #include <linux/swapops.h>
@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock); spin_unlock(&si->lock);
} }
static void swap_users_ref_free(struct percpu_ref *ref)
{
struct swap_info_struct *si;
si = container_of(ref, struct swap_info_struct, users);
complete(&si->comp);
}
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{ {
struct swap_cluster_info *ci = si->cluster_info; struct swap_cluster_info *ci = si->cluster_info;
@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
* via preventing the swap device from being swapoff, until * via preventing the swap device from being swapoff, until
* put_swap_device() is called. Otherwise return NULL. * put_swap_device() is called. Otherwise return NULL.
* *
* The entirety of the RCU read critical section must come before the
* return from or after the call to synchronize_rcu() in
* enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
* true, the si->map, si->cluster_info, etc. must be valid in the
* critical section.
*
* Notice that swapoff or swapoff+swapon can still happen before the * Notice that swapoff or swapoff+swapon can still happen before the
* rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() * percpu_ref_tryget_live() in get_swap_device() or after the
* in put_swap_device() if there isn't any other way to prevent * percpu_ref_put() in put_swap_device() if there isn't any other way
* swapoff, such as page lock, page table lock, etc. The caller must * to prevent swapoff, such as page lock, page table lock, etc. The
* be prepared for that. For example, the following situation is * caller must be prepared for that. For example, the following
* possible. * situation is possible.
* *
* CPU1 CPU2 * CPU1 CPU2
* do_swap_page() * do_swap_page()
@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry); si = swp_swap_info(entry);
if (!si) if (!si)
goto bad_nofile; goto bad_nofile;
if (!percpu_ref_tryget_live(&si->users))
rcu_read_lock(); goto out;
if (data_race(!(si->flags & SWP_VALID))) /*
goto unlock_out; * Guarantee the si->users are checked before accessing other
* fields of swap_info_struct.
*
* Paired with the spin_unlock() after setup_swap_info() in
* enable_swap_info().
*/
smp_rmb();
offset = swp_offset(entry); offset = swp_offset(entry);
if (offset >= si->max) if (offset >= si->max)
goto unlock_out; goto put_out;
return si; return si;
bad_nofile: bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out: out:
return NULL; return NULL;
unlock_out: put_out:
rcu_read_unlock(); percpu_ref_put(&si->users);
return NULL; return NULL;
} }
@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
static void _enable_swap_info(struct swap_info_struct *p) static void _enable_swap_info(struct swap_info_struct *p)
{ {
p->flags |= SWP_WRITEOK | SWP_VALID; p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages); atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages; total_swap_pages += p->pages;
@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
spin_unlock(&p->lock); spin_unlock(&p->lock);
spin_unlock(&swap_lock); spin_unlock(&swap_lock);
/* /*
* Guarantee swap_map, cluster_info, etc. fields are valid * Finished initializing swap device, now it's safe to reference it.
* between get/put_swap_device() if SWP_VALID bit is set
*/ */
synchronize_rcu(); percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock); spin_lock(&swap_lock);
spin_lock(&p->lock); spin_lock(&p->lock);
_enable_swap_info(p); _enable_swap_info(p);
@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock(); reenable_swap_slots_cache_unlock();
spin_lock(&swap_lock);
spin_lock(&p->lock);
p->flags &= ~SWP_VALID; /* mark swap device as invalid */
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/* /*
* wait for swap operations protected by get/put_swap_device() * Wait for swap operations protected by get/put_swap_device()
* to complete * to complete.
*
* We need synchronize_rcu() here to protect the accessing to
* the swap cache data structure.
*/ */
percpu_ref_kill(&p->users);
synchronize_rcu(); synchronize_rcu();
wait_for_completion(&p->comp);
flush_work(&p->discard_work); flush_work(&p->discard_work);
@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
if (!p) if (!p)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
if (percpu_ref_init(&p->users, swap_users_ref_free,
PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
kvfree(p);
return ERR_PTR(-ENOMEM);
}
spin_lock(&swap_lock); spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) { for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED)) if (!(swap_info[type]->flags & SWP_USED))
@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
} }
if (type >= MAX_SWAPFILES) { if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock); spin_unlock(&swap_lock);
percpu_ref_exit(&p->users);
kvfree(p); kvfree(p);
return ERR_PTR(-EPERM); return ERR_PTR(-EPERM);
} }
@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
plist_node_init(&p->avail_lists[i], 0); plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED; p->flags = SWP_USED;
spin_unlock(&swap_lock); spin_unlock(&swap_lock);
kvfree(defer); if (defer) {
percpu_ref_exit(&defer->users);
kvfree(defer);
}
spin_lock_init(&p->lock); spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock); spin_lock_init(&p->cont_lock);
init_completion(&p->comp);
return p; return p;
} }