mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-11 04:18:39 +08:00
for-6.10-rc7-tag
-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmaRcQgACgkQxWXV+ddt WDvAGxAAknJAiREp/AmzhSwkhr+nSnqex0t+VVgsOaMTu0BEHO0xhoXc3l0QuSwS u2AIqmOYyzr/UQVXCuatBqAE+5T4njtYAYIWwE825yquAtHNyuok9+Sjhfvxrwgs HmNAN4Vvl2Fwds7xbWE8ug18QlssuRTIX8hk7ZtS6xo49g0tsbRX9KlzIPpsULD3 BOZa+2NJwC1PGVeNPf3p06rfiUkKfmFYgdDybe2zJ17uwsRz1CFSsaEEB35ys1f0 xYOS4epfcie03EGyZmYctuNxatUkk/J/1lTH4Z9JHwvPBvLK1U97SyJ11Wz2VQC/ 8ar8gUDRYtjWdf6vn6AWBM4MseaYm9LDMlPhbSfvpDcWiclGTE64IOP4gKKr3mCh WzlNSIR9I+tYgrhvcsCEzd7lvrSVHa7clwfooYgkEx0wl5lgbN0llAdtJWG3eeLn 3stxje2FqqXsFNj5N9SrPy7f7t6xF2i8vwk4qh6EpRuT4yuatb+nWzDm9EuTT/Bc P+zM1KFp7Blk7Zw/Tpw0O9qjt1whStY2xrqcMzg539WVo45MmuFEFzmGBRwZsH55 QPGLIjXPpt728AgMdhBFEG0DtWaiA3AOI/C5nYOtLu92aZVBmbaX7/d/GpJv3Vvd Ihvr9s1c49YvTZsIS0T0tkq/7LXZi/SToRJDjhP5HCrRGf7A30Y= =gtsF -----END PGP SIGNATURE----- Merge tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fixes from David Sterba: "Fix a regression in extent map shrinker behaviour. In the past weeks we got reports from users that there are huge latency spikes or freezes. This was bisected to newly added shrinker of extent maps (it was added to fix a build up of the structures in memory). I'm assuming that the freezes would happen to many users after release so I'd like to get it merged now so it's in 6.10. Although the diff size is not small the changes are relatively straightforward, the reporters verified the fixes and we did testing on our side. 
The fixes: - adjust behaviour under memory pressure and check lock or scheduling conditions, bail out if needed - synchronize tracking of the scanning progress so inode ranges are not skipped or work duplicated - do a delayed iput when scanning a root so evicting an inode does not slow things down in case of lots of dirty data, also fix lockdep warning, a deadlock could happen when writing the dirty data would need to start a transaction" * tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: avoid races when tracking progress for extent map shrinking btrfs: stop extent map shrinker if reschedule is needed btrfs: use delayed iput during extent map shrinking
This commit is contained in:
commit
975f3b6da1
@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock_init(&fs_info->extent_map_shrinker_lock);
|
||||
|
||||
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
@ -1028,7 +1028,14 @@ out_free_pre:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
|
||||
struct btrfs_em_shrink_ctx {
|
||||
long nr_to_scan;
|
||||
long scanned;
|
||||
u64 last_ino;
|
||||
u64 last_root;
|
||||
};
|
||||
|
||||
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
|
||||
{
|
||||
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
|
||||
struct extent_map_tree *tree = &inode->extent_tree;
|
||||
@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
|
||||
if (!down_read_trylock(&inode->i_mmap_lock))
|
||||
return 0;
|
||||
|
||||
write_lock(&tree->lock);
|
||||
/*
|
||||
* We want to be fast because we can be called from any path trying to
|
||||
* allocate memory, so if the lock is busy we don't want to spend time
|
||||
* waiting for it - either some task is about to do IO for the inode or
|
||||
* we may have another task shrinking extent maps, here in this code, so
|
||||
* skip this inode.
|
||||
*/
|
||||
if (!write_trylock(&tree->lock)) {
|
||||
up_read(&inode->i_mmap_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
node = rb_first_cached(&tree->map);
|
||||
while (node) {
|
||||
struct extent_map *em;
|
||||
|
||||
em = rb_entry(node, struct extent_map, rb_node);
|
||||
node = rb_next(node);
|
||||
(*scanned)++;
|
||||
ctx->scanned++;
|
||||
|
||||
if (em->flags & EXTENT_FLAG_PINNED)
|
||||
goto next;
|
||||
@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
|
||||
free_extent_map(em);
|
||||
nr_dropped++;
|
||||
next:
|
||||
if (*scanned >= nr_to_scan)
|
||||
if (ctx->scanned >= ctx->nr_to_scan)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Restart if we had to reschedule, and any extent maps that were
|
||||
* pinned before may have become unpinned after we released the
|
||||
* lock and took it again.
|
||||
* Stop if we need to reschedule or there's contention on the
|
||||
* lock. This is to avoid slowing other tasks trying to take the
|
||||
* lock and because the shrinker might be called during a memory
|
||||
* allocation path and we want to avoid taking a very long time
|
||||
* and slowing down all sorts of tasks.
|
||||
*/
|
||||
if (cond_resched_rwlock_write(&tree->lock))
|
||||
node = rb_first_cached(&tree->map);
|
||||
if (need_resched() || rwlock_needbreak(&tree->lock))
|
||||
break;
|
||||
}
|
||||
write_unlock(&tree->lock);
|
||||
up_read(&inode->i_mmap_lock);
|
||||
@ -1102,25 +1122,30 @@ next:
|
||||
return nr_dropped;
|
||||
}
|
||||
|
||||
static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
|
||||
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
struct btrfs_inode *inode;
|
||||
long nr_dropped = 0;
|
||||
u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
|
||||
u64 min_ino = ctx->last_ino + 1;
|
||||
|
||||
inode = btrfs_find_first_inode(root, min_ino);
|
||||
while (inode) {
|
||||
nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
|
||||
nr_dropped += btrfs_scan_inode(inode, ctx);
|
||||
|
||||
min_ino = btrfs_ino(inode) + 1;
|
||||
fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
|
||||
iput(&inode->vfs_inode);
|
||||
ctx->last_ino = btrfs_ino(inode);
|
||||
btrfs_add_delayed_iput(inode);
|
||||
|
||||
if (*scanned >= nr_to_scan)
|
||||
if (ctx->scanned >= ctx->nr_to_scan)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We may be called from memory allocation paths, so we don't
|
||||
* want to take too much time and slow down tasks.
|
||||
*/
|
||||
if (need_resched())
|
||||
break;
|
||||
|
||||
cond_resched();
|
||||
inode = btrfs_find_first_inode(root, min_ino);
|
||||
}
|
||||
|
||||
@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
|
||||
* inode if there is one or we will find out this was the last
|
||||
* one and move to the next root.
|
||||
*/
|
||||
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
|
||||
ctx->last_root = btrfs_root_id(root);
|
||||
} else {
|
||||
/*
|
||||
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
|
||||
* that when processing the next root we start from its first inode.
|
||||
*/
|
||||
fs_info->extent_map_shrinker_last_ino = 0;
|
||||
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
|
||||
ctx->last_ino = 0;
|
||||
ctx->last_root = btrfs_root_id(root) + 1;
|
||||
}
|
||||
|
||||
return nr_dropped;
|
||||
@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
|
||||
|
||||
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
|
||||
{
|
||||
const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
|
||||
u64 next_root_id = start_root_id;
|
||||
struct btrfs_em_shrink_ctx ctx;
|
||||
u64 start_root_id;
|
||||
u64 next_root_id;
|
||||
bool cycled = false;
|
||||
long nr_dropped = 0;
|
||||
long scanned = 0;
|
||||
|
||||
ctx.scanned = 0;
|
||||
ctx.nr_to_scan = nr_to_scan;
|
||||
|
||||
/*
|
||||
* In case we have multiple tasks running this shrinker, make the next
|
||||
* one start from the next inode in case it starts before we finish.
|
||||
*/
|
||||
spin_lock(&fs_info->extent_map_shrinker_lock);
|
||||
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
|
||||
fs_info->extent_map_shrinker_last_ino++;
|
||||
ctx.last_root = fs_info->extent_map_shrinker_last_root;
|
||||
spin_unlock(&fs_info->extent_map_shrinker_lock);
|
||||
|
||||
start_root_id = ctx.last_root;
|
||||
next_root_id = ctx.last_root;
|
||||
|
||||
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
|
||||
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
|
||||
|
||||
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
|
||||
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
|
||||
nr, ctx.last_root,
|
||||
ctx.last_ino);
|
||||
}
|
||||
|
||||
while (scanned < nr_to_scan) {
|
||||
/*
|
||||
* We may be called from memory allocation paths, so we don't want to
|
||||
* take too much time and slow down tasks, so stop if we need to reschedule.
|
||||
*/
|
||||
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
|
||||
struct btrfs_root *root;
|
||||
unsigned long count;
|
||||
|
||||
@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
|
||||
spin_unlock(&fs_info->fs_roots_radix_lock);
|
||||
if (start_root_id > 0 && !cycled) {
|
||||
next_root_id = 0;
|
||||
fs_info->extent_map_shrinker_last_root = 0;
|
||||
fs_info->extent_map_shrinker_last_ino = 0;
|
||||
ctx.last_root = 0;
|
||||
ctx.last_ino = 0;
|
||||
cycled = true;
|
||||
continue;
|
||||
}
|
||||
@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
|
||||
continue;
|
||||
|
||||
if (is_fstree(btrfs_root_id(root)))
|
||||
nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
|
||||
nr_dropped += btrfs_scan_root(root, &ctx);
|
||||
|
||||
btrfs_put_root(root);
|
||||
}
|
||||
|
||||
/*
|
||||
* In case of multiple tasks running this extent map shrinking code this
|
||||
* isn't perfect but it's simple and silences things like KCSAN. It's
|
||||
* not possible to know which task made more progress because we can
|
||||
* cycle back to the first root and first inode if it's not the first
|
||||
* time the shrinker ran, see the above logic. Also a task that started
|
||||
* later may finish earlier than another task and have made less progress. So
|
||||
* make this simple and update to the progress of the last task that
|
||||
* finished, with the occasional possibility of having two consecutive
|
||||
* runs of the shrinker process the same inodes.
|
||||
*/
|
||||
spin_lock(&fs_info->extent_map_shrinker_lock);
|
||||
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
|
||||
fs_info->extent_map_shrinker_last_root = ctx.last_root;
|
||||
spin_unlock(&fs_info->extent_map_shrinker_lock);
|
||||
|
||||
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
|
||||
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
|
||||
|
||||
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
|
||||
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
|
||||
nr, ctx.last_root,
|
||||
ctx.last_ino);
|
||||
}
|
||||
|
||||
return nr_dropped;
|
||||
|
@ -630,6 +630,7 @@ struct btrfs_fs_info {
|
||||
s32 delalloc_batch;
|
||||
|
||||
struct percpu_counter evictable_extent_maps;
|
||||
spinlock_t extent_map_shrinker_lock;
|
||||
u64 extent_map_shrinker_last_root;
|
||||
u64 extent_map_shrinker_last_ino;
|
||||
|
||||
|
@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
|
||||
|
||||
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
|
||||
|
||||
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
|
||||
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
|
||||
u64 last_root_id, u64 last_ino),
|
||||
|
||||
TP_ARGS(fs_info, nr_to_scan, nr),
|
||||
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
|
||||
|
||||
TP_STRUCT__entry_btrfs(
|
||||
__field( long, nr_to_scan )
|
||||
@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
|
||||
TP_fast_assign_btrfs(fs_info,
|
||||
__entry->nr_to_scan = nr_to_scan;
|
||||
__entry->nr = nr;
|
||||
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
|
||||
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
|
||||
__entry->last_root_id = last_root_id;
|
||||
__entry->last_ino = last_ino;
|
||||
),
|
||||
|
||||
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
|
||||
@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
|
||||
|
||||
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
|
||||
|
||||
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
|
||||
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
|
||||
u64 last_root_id, u64 last_ino),
|
||||
|
||||
TP_ARGS(fs_info, nr_dropped, nr),
|
||||
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
|
||||
|
||||
TP_STRUCT__entry_btrfs(
|
||||
__field( long, nr_dropped )
|
||||
@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
|
||||
TP_fast_assign_btrfs(fs_info,
|
||||
__entry->nr_dropped = nr_dropped;
|
||||
__entry->nr = nr;
|
||||
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
|
||||
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
|
||||
__entry->last_root_id = last_root_id;
|
||||
__entry->last_ino = last_ino;
|
||||
),
|
||||
|
||||
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
|
||||
|
Loading…
Reference in New Issue
Block a user