linux/fs/internal.h
Johannes Weiner 51b8c1fe25 vfs: keep inodes with page cache off the inode shrinker LRU
Historically (pre-2.5), the inode shrinker used to reclaim only empty
inodes and skip over those that still contained page cache.  This caused
problems on highmem hosts: struct inodes could fill lowmem zones
before the cache was getting reclaimed in the highmem zones.

To address this, the inode shrinker started to strip page cache to
facilitate reclaiming lowmem.  However, this comes with its own set of
problems: the shrinkers may drop actively used page cache just because
the inodes are not currently open or dirty - think working with a large
git tree.  It further doesn't respect cgroup memory protection settings
and can cause priority inversions between containers.

Nowadays, the page cache also holds non-resident info for evicted cache
pages in order to detect refaults.  We've come to rely heavily on this
data inside reclaim for protecting the cache workingset and driving swap
behavior.  We also use it to quantify and report workload health through
psi.  The latter in turn is used for fleet health monitoring, as well as
driving automated memory sizing of workloads and containers, proactive
reclaim and memory offloading schemes.

The consequence of dropping page cache prematurely is that we're seeing
subtle and not-so-subtle failures in all of the above-mentioned
scenarios, with the workload generally entering unexpected thrashing
states while losing the ability to reliably detect it.

To fix this on non-highmem systems at least, going back to rotating
inodes on the LRU isn't feasible.  We've tried (commit a76cf1a474
("mm: don't reclaim inodes with many attached pages")) and failed
(commit 69056ee6a8 ("Revert "mm: don't reclaim inodes with many
attached pages"")).

The issue is mostly that shrinker pools attract pressure based on their
size, and when objects get skipped the shrinkers remember this as
deferred reclaim work.  This accumulates excessive pressure on the
remaining inodes, and we can quickly eat into heavily used ones, or
dirty ones that require IO to reclaim, when there potentially is plenty
of cold, clean cache around still.
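
To make that dynamic concrete, here is a toy model in plain C of how the
deferred work snowballs; it is purely illustrative (struct toy_shrinker,
toy_busy() and toy_scan() are made-up names, not the kernel's shrinker code):

#include <stdbool.h>

/* Toy model, not kernel code: pools attract pressure in proportion to
 * their size, and every object skipped is remembered as deferred work. */
struct toy_shrinker {
        unsigned long nr_objects;       /* LRU size: bigger pool, more pressure */
        unsigned long nr_deferred;      /* skipped work carried to the next pass */
};

/* Pretend every other object is busy, e.g. an inode that still has cache. */
static bool toy_busy(unsigned long i)
{
        return (i % 2) == 0;
}

static unsigned long toy_scan(struct toy_shrinker *s, unsigned long pressure)
{
        unsigned long want = pressure + s->nr_deferred; /* old debt comes back */
        unsigned long freed = 0;

        for (unsigned long i = 0; i < want && s->nr_objects; i++) {
                if (toy_busy(i))
                        continue;       /* skipped, but still owed */
                s->nr_objects--;
                freed++;
        }
        s->nr_deferred = want - freed;  /* unmet pressure piles up */
        return freed;
}

Every pass that skips populated inodes inflates the target of the next pass,
so the pressure concentrates on whatever busy or dirty inodes remain.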

Instead, this patch keeps populated inodes off the inode LRU in the
first place - just like an open file or dirty state would.  An otherwise
clean and unused inode then gets queued when the last cache entry
disappears.  This solves the problem without reintroducing the reclaim
issues, and generally is a bit more scalable than having to wade through
potentially hundreds of thousands of busy inodes.
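
In sketch form, the eligibility rule described above could look roughly like
this; the helper names (inode_maybe_add_lru(), inode_lru_list_add()) are
illustrative stand-ins rather than the actual diff:

/*
 * Illustrative sketch only: an inode is an LRU (and thus shrinker)
 * candidate only while it is unreferenced, clean, not under writeback
 * or teardown, and holds no page cache or shadow entries.
 */
static void inode_maybe_add_lru(struct inode *inode)
{
        lockdep_assert_held(&inode->i_lock);

        if (atomic_read(&inode->i_count))
                return;         /* still in use somewhere */
        if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
                return;         /* dirty, syncing or being torn down */
        if (!mapping_empty(&inode->i_data))
                return;         /* cache or shadow entries keep it off the LRU */

        inode_lru_list_add(inode);      /* hypothetical: queue on the sb's LRU */
}

The complementary half is that the page cache deletion path calls such a
helper when it removes the last entry, which is what the locking below is
about.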

Locking is a bit tricky because the locks protecting the inode state
(i_lock) and the inode LRU (lru_list.lock) don't nest inside the
irq-safe page cache lock (i_pages.xa_lock).  Page cache deletions are
serialized through i_lock, taken before the i_pages lock, to make sure
depopulated inodes are queued reliably.  Additions may race with
deletions, but we'll check again in the shrinker.  If additions race
with the shrinker itself, we're protected by the i_lock: if find_inode()
or iput() win, the shrinker will bail on the elevated i_count or
I_REFERENCED; if the shrinker wins and goes ahead with the inode, it
will set I_FREEING and inhibit further igets(), which will cause the
other side to create a new instance of the inode instead.
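
Putting that ordering together, a simplified view of the deletion path might
be (again a sketch, reusing the hypothetical inode_maybe_add_lru() from
above):

/* Illustrative deletion-path fragment, not the actual patch: */
spin_lock(&inode->i_lock);              /* taken before the i_pages lock */
xa_lock_irq(&inode->i_data.i_pages);
/* ... remove the folio/shadow entry and update the accounting ... */
xa_unlock_irq(&inode->i_data.i_pages);
inode_maybe_add_lru(inode);             /* no-op unless now empty, clean and unused */
spin_unlock(&inode->i_lock);

Holding i_lock across both the removal and the queueing is what makes the
"last cache entry gone" transition reliable, while doing the LRU add only
after the xarray lock is dropped respects the lock ordering above.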

Link: https://lkml.kernel.org/r/20210614211904.14420-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-09 10:02:48 -08:00


/* SPDX-License-Identifier: GPL-2.0-or-later */
/* fs/ internal definitions
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct user_namespace;
struct pipe_inode_info;

/*
 * block/bdev.c
 */
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
extern int __sync_blockdev(struct block_device *bdev, int wait);
void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
static inline int __sync_blockdev(struct block_device *bdev, int wait)
{
        return 0;
}
static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
                                 void *arg)
{
}
static inline int emergency_thaw_bdev(struct super_block *sb)
{
        return 0;
}
#endif /* CONFIG_BLOCK */

/*
 * buffer.c
 */
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
                            get_block_t *get_block, const struct iomap *iomap);

/*
 * char_dev.c
 */
extern void __init chrdev_init(void);

/*
 * fs_context.c
 */
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);

/*
 * namei.c
 */
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
                           struct path *path, struct path *root);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
                           const char *, unsigned int, struct path *);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct user_namespace *mnt_userns, struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
                 struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
              struct filename *new, int flags);

/*
 * namespace.c
 */
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write_file(struct file *);
extern void dissolve_on_fput(struct vfsmount *);
int path_mount(const char *dev_name, struct path *path,
               const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

/*
 * fs_struct.c
 */
extern void chroot_fs_refs(const struct path *, const struct path *);

/*
 * file_table.c
 */
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);

/*
 * super.c
 */
extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);

/*
 * open.c
 */
struct open_flags {
        int open_flag;
        umode_t mode;
        int acc_mode;
        int intent;
        int lookup_flags;
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op);
extern struct file *do_file_open_root(const struct path *,
                const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag);
int chown_common(const struct path *path, uid_t user, gid_t group);
extern int vfs_open(const struct path *, struct file *);

/*
 * inode.c
 */
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern int dentry_needs_remove_privs(struct dentry *dentry);

/*
 * fs-writeback.c
 */
extern long get_nr_dirty_inodes(void);
extern int invalidate_inodes(struct super_block *, bool);

/*
 * dcache.c
 */
extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
extern struct dentry *d_alloc_cursor(struct dentry *);
extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);

/*
 * read_write.c
 */
extern int rw_verify_area(int, struct file *, const loff_t *, size_t);

/*
 * pipe.c
 */
extern const struct file_operations pipefifo_fops;

/*
 * fs_pin.c
 */
extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);

/*
 * fs/nsfs.c
 */
extern const struct dentry_operations ns_dentry_operations;

/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);

/*
 * fs/stat.c:
 */
int do_statx(int dfd, const char __user *filename, unsigned flags,
             unsigned int mask, struct statx __user *buffer);

/*
 * fs/splice.c:
 */
long splice_file_to_pipe(struct file *in,
                         struct pipe_inode_info *opipe,
                         loff_t *offset,
                         size_t len, unsigned int flags);